From 38d4f5b4c985f1d865f1538b15fe9b436de19970 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 3 Nov 2020 20:09:54 +0100 Subject: [PATCH 001/524] Do not fail only empty courses --- PFERD/organizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PFERD/organizer.py b/PFERD/organizer.py index 1665f23..87bc684 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -124,6 +124,8 @@ class Organizer(Location): self._cleanup(self.path) def _cleanup(self, start_dir: Path) -> None: + if not start_dir.exists(): + return paths: List[Path] = list(start_dir.iterdir()) # Recursively clean paths From f4abe3197ca976dfa8d075225dd5c17e9bde0d63 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 3 Nov 2020 20:40:09 +0100 Subject: [PATCH 002/524] Add ipd crawler --- PFERD/ipd.py | 150 +++++++++++++++++++++++++++++++++++++++++++++++++ PFERD/pferd.py | 51 +++++++++++++++++ 2 files changed, 201 insertions(+) create mode 100644 PFERD/ipd.py diff --git a/PFERD/ipd.py b/PFERD/ipd.py new file mode 100644 index 0000000..33aaff1 --- /dev/null +++ b/PFERD/ipd.py @@ -0,0 +1,150 @@ +""" +Utility functions and a scraper/downloader for the IPD pages. +""" +import datetime +import logging +import math +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, List, Optional +from urllib.parse import urljoin + +import bs4 +import requests + +from PFERD.errors import FatalException +from PFERD.utils import soupify + +from .logging import PrettyLogger +from .organizer import Organizer +from .tmp_dir import TmpDir +from .transform import Transformable +from .utils import stream_to_path + +LOGGER = logging.getLogger(__name__) +PRETTY = PrettyLogger(LOGGER) + + +@dataclass +class IpdDownloadInfo(Transformable): + """ + Information about an ipd entry. 
+ """ + url: str + modification_date: Optional[datetime.datetime] + + +IpdDownloadStrategy = Callable[[Organizer, IpdDownloadInfo], bool] + + +def ipd_download_new_or_modified(organizer: Organizer, info: IpdDownloadInfo) -> bool: + """ + Accepts new files or files with a more recent modification date. + """ + resolved_file = organizer.resolve(info.path) + if not resolved_file.exists(): + return True + if not info.modification_date: + PRETTY.ignored_file(info.path, "could not find modification time, file exists") + return False + + resolved_mod_time_seconds = resolved_file.stat().st_mtime + + # Download if the info is newer + if info.modification_date.timestamp() > resolved_mod_time_seconds: + return True + + PRETTY.ignored_file(info.path, "local file has newer or equal modification time") + return False + + +class IpdCrawler: + # pylint: disable=too-few-public-methods + """ + A crawler for IPD pages. + """ + + def __init__(self, base_url: str): + self._base_url = base_url + + def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: + """ + Create an absolute url from an tag. + """ + return urljoin(self._base_url, link_tag.get("href")) + + def crawl(self) -> List[IpdDownloadInfo]: + """ + Crawls the playlist given in the constructor. + """ + page = soupify(requests.get(self._base_url)) + + items: List[IpdDownloadInfo] = [] + + for link in page.findAll(name="a", attrs={"href": lambda x: x and x.endswith("pdf")}): + href: str = link.attrs.get("href") + name = href.split("/")[-1] + + modification_date: Optional[datetime.datetime] + try: + enclosing_row: bs4.Tag = link.findParent(name="tr") + date_text = enclosing_row.find(name="td").text + modification_date = datetime.datetime.strptime(date_text, "%d.%m.%Y") + except ValueError: + modification_date = None + + items.append(IpdDownloadInfo( + Path(name), + url=self._abs_url_from_link(link), + modification_date=modification_date + )) + + return items + + +class IpdDownloader: + """ + A downloader for ipd files. 
+ """ + + def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: IpdDownloadStrategy): + self._tmp_dir = tmp_dir + self._organizer = organizer + self._strategy = strategy + self._session = requests.session() + + def download_all(self, infos: List[IpdDownloadInfo]) -> None: + """ + Download multiple files one after the other. + """ + for info in infos: + self.download(info) + + def download(self, info: IpdDownloadInfo) -> None: + """ + Download a single file. + """ + if not self._strategy(self._organizer, info): + self._organizer.mark(info.path) + return + + with self._session.get(info.url, stream=True) as response: + if response.status_code == 200: + tmp_file = self._tmp_dir.new_path() + stream_to_path(response, tmp_file, info.path.name) + dst_path = self._organizer.accept_file(tmp_file, info.path) + + if dst_path and info.modification_date: + os.utime( + dst_path, + times=( + math.ceil(info.modification_date.timestamp()), + math.ceil(info.modification_date.timestamp()) + ) + ) + + elif response.status_code == 403: + raise FatalException("Received 403. 
Are you not using the KIT VPN?") + else: + PRETTY.warning(f"Could not download file, got response {response.status_code}") diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 042dd93..f57f078 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -14,6 +14,8 @@ from .errors import FatalException, swallow_and_print_errors from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter, IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy, KitShibbolethAuthenticator, download_modified_or_new) +from .ipd import (IpdCrawler, IpdDownloader, IpdDownloadInfo, + IpdDownloadStrategy, ipd_download_new_or_modified) from .location import Location from .logging import PrettyLogger, enable_logging from .organizer import Organizer @@ -294,6 +296,55 @@ class Pferd(Location): return organizer + @swallow_and_print_errors + def ipd_kit( + self, + target: Union[PathLike, Organizer], + url: str, + transform: Transform = lambda x: x, + download_strategy: IpdDownloadStrategy = ipd_download_new_or_modified, + clean: bool = True + ) -> Organizer: + """ + Synchronizes a folder with a DIVA playlist. + + Arguments: + target {Union[PathLike, Organizer]} -- The organizer / target folder to use. + url {str} -- the url to the page + + Keyword Arguments: + transform {Transform} -- A transformation function for the output paths. Return None + to ignore a file. (default: {lambdax:x}) + download_strategy {DivaDownloadStrategy} -- A function to determine which files need to + be downloaded. Can save bandwidth and reduce the number of requests. + (default: {diva_download_new}) + clean {bool} -- Whether to clean up when the method finishes. 
+ """ + tmp_dir = self._tmp_dir.new_subdir() + + if target is None: + PRETTY.starting_synchronizer("None", "IPD", url) + raise FatalException("Got 'None' as target directory, aborting") + + if isinstance(target, Organizer): + organizer = target + else: + organizer = Organizer(self.resolve(to_path(target))) + + PRETTY.starting_synchronizer(organizer.path, "IPD", url) + + elements: List[IpdDownloadInfo] = IpdCrawler(url).crawl() + transformed = apply_transform(transform, elements) + + if self._test_run: + self._print_transformables(transformed) + return organizer + + downloader = IpdDownloader(tmp_dir=tmp_dir, organizer=organizer, strategy=download_strategy) + downloader.download_all(transformed) + + return organizer + @swallow_and_print_errors def diva_kit( self, From 0da2fafcd8b7dbe258775cb5ff4f84fa671b846f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 Nov 2020 14:38:15 +0100 Subject: [PATCH 003/524] Fix links outside tables --- PFERD/ipd.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PFERD/ipd.py b/PFERD/ipd.py index 33aaff1..d602e0e 100644 --- a/PFERD/ipd.py +++ b/PFERD/ipd.py @@ -86,11 +86,12 @@ class IpdCrawler: href: str = link.attrs.get("href") name = href.split("/")[-1] - modification_date: Optional[datetime.datetime] + modification_date: Optional[datetime.datetime] = None try: enclosing_row: bs4.Tag = link.findParent(name="tr") - date_text = enclosing_row.find(name="td").text - modification_date = datetime.datetime.strptime(date_text, "%d.%m.%Y") + if enclosing_row: + date_text = enclosing_row.find(name="td").text + modification_date = datetime.datetime.strptime(date_text, "%d.%m.%Y") except ValueError: modification_date = None From ef343dec7c9ad7554ffbaf0b7301ce99666caaa9 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 Nov 2020 15:06:58 +0100 Subject: [PATCH 004/524] Merge organizer download summaries --- PFERD/pferd.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PFERD/pferd.py 
b/PFERD/pferd.py index f57f078..c01b5fd 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -343,6 +343,11 @@ class Pferd(Location): downloader = IpdDownloader(tmp_dir=tmp_dir, organizer=organizer, strategy=download_strategy) downloader.download_all(transformed) + if clean: + organizer.cleanup() + + self._download_summary.merge(organizer.download_summary) + return organizer @swallow_and_print_errors @@ -403,4 +408,6 @@ class Pferd(Location): if clean: organizer.cleanup() + self._download_summary.merge(organizer.download_summary) + return organizer From f830b42a3600519393341c6e720ae4612cbad75a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 Nov 2020 21:49:35 +0100 Subject: [PATCH 005/524] Fix duplicate files in download summary --- PFERD/download_summary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PFERD/download_summary.py b/PFERD/download_summary.py index 28d51b5..c8135de 100644 --- a/PFERD/download_summary.py +++ b/PFERD/download_summary.py @@ -40,9 +40,9 @@ class DownloadSummary: """ Merges ourselves with the passed summary. Modifies this object, but not the passed one. 
""" - self._new_files += summary.new_files - self._modified_files += summary.modified_files - self._deleted_files += summary.deleted_files + self._new_files = list(set(self._new_files + summary.new_files)) + self._modified_files = list(set(self._modified_files + summary.modified_files)) + self._deleted_files = list(set(self._deleted_files + summary.deleted_files)) def add_deleted_file(self, path: Path) -> None: """ From 6f78fef6047886c5ce3a3aa1b0bee99157a554d7 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 Nov 2020 22:08:33 +0100 Subject: [PATCH 006/524] Add quoting instructions to README --- .github/workflows/package.yml | 2 +- README.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index c451789..c217735 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -67,7 +67,7 @@ jobs: - name: "Upload release artifacts" uses: softprops/action-gh-release@v1 with: - body: "Download the correct sync_url for your platform and run it in the terminal or CMD. You might need to make it executable on Linux/Mac with `chmod +x `." + body: "Download the correct sync_url for your platform and run it in the terminal or CMD. You might need to make it executable on Linux/Mac with `chmod +x `. Also please *quote the url you pass to the program* or your shell might silently screw it up!" files: | pferd_sync_url_mac pferd_sync_url_linux diff --git a/README.md b/README.md index a1cd1dd..d82f557 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,8 @@ use, but doesn't expose all the configuration options and tweaks a full install does. 1. Download the `sync_url` binary from the [latest release](https://github.com/Garmelon/PFERD/releases/latest). -2. Run the binary in your terminal (`./sync_url` or `sync_url.exe` in the CMD) to see the help and use it. I'd recommend using the `--cookies` option. +2. 
Recognize that you most likely need to enclose the URL in `''` quotes to prevent your shell from interpreting `&` and other symbols +3. Run the binary in your terminal (`./sync_url` or `sync_url.exe` in the CMD) to see the help and use it. I'd recommend using the `--cookies` option. If you are on **Linux/Mac**, you need to *make the file executable* using `chmod +x `. If you are on **Mac**, you need to allow this unverified program to run (see e.g. [here](https://www.switchingtomac.com/tutorials/osx/how-to-run-unverified-apps-on-macos/)) From 6f30adcd2292556c5906976230dc27e7722417ad Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 Nov 2020 22:12:33 +0100 Subject: [PATCH 007/524] Fix quote type in README --- .github/workflows/package.yml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index c217735..1c0c353 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -67,7 +67,7 @@ jobs: - name: "Upload release artifacts" uses: softprops/action-gh-release@v1 with: - body: "Download the correct sync_url for your platform and run it in the terminal or CMD. You might need to make it executable on Linux/Mac with `chmod +x `. Also please *quote the url you pass to the program* or your shell might silently screw it up!" + body: "Download the correct sync_url for your platform and run it in the terminal or CMD. You might need to make it executable on Linux/Mac with `chmod +x `. Also please enclose the *url you pass to the program in double quotes* or your shell might silently screw it up!" files: | pferd_sync_url_mac pferd_sync_url_linux diff --git a/README.md b/README.md index d82f557..2df0722 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ use, but doesn't expose all the configuration options and tweaks a full install does. 1. Download the `sync_url` binary from the [latest release](https://github.com/Garmelon/PFERD/releases/latest). 
-2. Recognize that you most likely need to enclose the URL in `''` quotes to prevent your shell from interpreting `&` and other symbols +2. Recognize that you most likely need to enclose the URL in `""` quotes to prevent your shell from interpreting `&` and other symbols 3. Run the binary in your terminal (`./sync_url` or `sync_url.exe` in the CMD) to see the help and use it. I'd recommend using the `--cookies` option. If you are on **Linux/Mac**, you need to *make the file executable* using `chmod +x `. If you are on **Mac**, you need to allow this unverified program to run (see e.g. [here](https://www.switchingtomac.com/tutorials/osx/how-to-run-unverified-apps-on-macos/)) From 316b9d7bf4bfd864fa9ca8cb8fb3c2ca995d137f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 Nov 2020 22:20:40 +0100 Subject: [PATCH 008/524] Prevent too many retries when fetching an ILIAS page --- PFERD/ilias/crawler.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 7ce460e..036a479 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -593,10 +593,17 @@ class IliasCrawler: return results - def _get_page(self, url: str, params: Dict[str, Any]) -> bs4.BeautifulSoup: + def _get_page(self, url: str, params: Dict[str, Any], + retry_count: int = 0) -> bs4.BeautifulSoup: """ Fetches a page from ILIAS, authenticating when needed. """ + + if retry_count >= 4: + raise FatalException("Could not get a proper page after 4 tries. 
" + "Maybe your URL is wrong, authentication fails continuously, " + "your ILIAS connection is spotty or ILIAS is not well.") + LOGGER.debug("Fetching %r", url) response = self._session.get(url, params=params) @@ -617,7 +624,7 @@ class IliasCrawler: self._authenticator.authenticate(self._session) - return self._get_page(url, params) + return self._get_page(url, params, retry_count + 1) @staticmethod def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: From 9c4759103a0d80b17161f58ee8776d2409c46999 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 Nov 2020 11:25:06 +0100 Subject: [PATCH 009/524] Bump patch version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2df0722..babd760 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.1 +$ pip install git+https://github.com/Garmelon/PFERD@v2.4.2 ``` The use of [venv] is recommended. 
@@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.1 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.1/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.4.2 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.2/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index bac40d9..526669a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.4.1", + version="2.4.2", packages=find_packages(), install_requires=[ "requests>=2.21.0", From f2aba970fd572161f614f773916f85d03d8dc34d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 10 Nov 2020 15:27:12 +0100 Subject: [PATCH 010/524] [sync_url] Sanitize path names on windows --- sync_url.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sync_url.py b/sync_url.py index d2dce94..cb837a2 100755 --- a/sync_url.py +++ b/sync_url.py @@ -5,16 +5,27 @@ A simple script to download a course by name from ILIAS. 
""" import argparse -from pathlib import Path +import os +import re +from pathlib import Path, PurePath +from typing import Optional from urllib.parse import urlparse from PFERD import Pferd from PFERD.cookie_jar import CookieJar from PFERD.ilias import (IliasCrawler, IliasElementType, KitShibbolethAuthenticator) +from PFERD.transform import re_rename from PFERD.utils import to_path +def sanitize_path(path: PurePath) -> Optional[PurePath]: + # Escape windows illegal path characters + if os.name == 'nt': + return PurePath(re.sub(r'[<>:"/\\|?]', "", str(path))) + return path + + def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--test-run", action="store_true") @@ -59,7 +70,8 @@ def main() -> None: target=folder, full_url=args.url, cookies=args.cookies, - dir_filter=dir_filter + dir_filter=dir_filter, + transform=sanitize_path ) From 4ac51048c115e2dfc4c04228cda65b4c16346daf Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 10 Nov 2020 20:49:14 +0100 Subject: [PATCH 011/524] Use "_" as a replacement for illegal characters --- sync_url.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sync_url.py b/sync_url.py index cb837a2..ebb635b 100755 --- a/sync_url.py +++ b/sync_url.py @@ -15,14 +15,13 @@ from PFERD import Pferd from PFERD.cookie_jar import CookieJar from PFERD.ilias import (IliasCrawler, IliasElementType, KitShibbolethAuthenticator) -from PFERD.transform import re_rename from PFERD.utils import to_path def sanitize_path(path: PurePath) -> Optional[PurePath]: # Escape windows illegal path characters if os.name == 'nt': - return PurePath(re.sub(r'[<>:"/\\|?]', "", str(path))) + return PurePath(re.sub(r'[<>:"/\\|?]', "_", str(path))) return path From 733e1ae136d69e9885046858cc907970dc6884be Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 10 Nov 2020 20:50:31 +0100 Subject: [PATCH 012/524] Bump version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/README.md b/README.md index babd760..e35f209 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.2 +$ pip install git+https://github.com/Garmelon/PFERD@v2.4.3 ``` The use of [venv] is recommended. @@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.2 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.2/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.4.3 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.3/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index 526669a..8335f7f 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.4.2", + version="2.4.3", packages=find_packages(), install_requires=[ "requests>=2.21.0", From 1486a63854d38662035630efbccb29d5ccb931a9 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 10 Nov 2020 22:53:47 +0100 Subject: [PATCH 013/524] Do not collapse directory structure when sanitizing --- sync_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync_url.py b/sync_url.py index ebb635b..998584f 100755 --- a/sync_url.py +++ b/sync_url.py @@ -21,7 +21,7 @@ from PFERD.utils import to_path def sanitize_path(path: PurePath) -> Optional[PurePath]: # Escape windows illegal path characters if os.name == 'nt': - return PurePath(re.sub(r'[<>:"/\\|?]', "_", str(path))) + return PurePath(re.sub(r'[<>:"/|?]', "_", str(path))) return path From a0ae9aee2730a75558d525bdb1d36f89f9da27ae Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 11 Nov 2020 09:36:20 +0100 Subject: [PATCH 014/524] 
Sanitize individual path parts --- sync_url.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sync_url.py b/sync_url.py index 998584f..2b8bc56 100755 --- a/sync_url.py +++ b/sync_url.py @@ -21,7 +21,8 @@ from PFERD.utils import to_path def sanitize_path(path: PurePath) -> Optional[PurePath]: # Escape windows illegal path characters if os.name == 'nt': - return PurePath(re.sub(r'[<>:"/|?]', "_", str(path))) + sanitized_parts = [re.sub(r'[<>:"/|?]', "_", x) for x in list(path.parts)] + return PurePath(*sanitized_parts) return path From 55e9e719ad405171d7f8de66a4831bd7f659d9fe Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 12 Nov 2020 19:32:45 +0100 Subject: [PATCH 015/524] Sanitize "/" in ilias path names --- PFERD/ilias/crawler.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 036a479..f5b1ae8 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -26,6 +26,10 @@ LOGGER = logging.getLogger(__name__) PRETTY = PrettyLogger(LOGGER) +def _sanitize_path_name(name: str) -> str: + return name.replace("/", "-") + + class IliasElementType(Enum): """ The type of an ilias element. @@ -260,7 +264,7 @@ class IliasCrawler: links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle") for link in links: abs_url = self._abs_url_from_link(link) - element_path = Path(folder_path, link.getText().strip()) + element_path = Path(folder_path, _sanitize_path_name(link.getText().strip())) element_type = self._find_type_from_link(element_path, link, abs_url) if element_type == IliasElementType.REGULAR_FILE: @@ -377,7 +381,7 @@ class IliasCrawler: modification_date = demangle_date(modification_date_str) # Grab the name from the link text - name = link_element.getText() + name = _sanitize_path_name(link_element.getText()) full_path = Path(path, name + "." 
+ file_type) return [ @@ -508,7 +512,7 @@ class IliasCrawler: ).getText().strip() title += ".mp4" - video_path: Path = Path(parent_path, title) + video_path: Path = Path(parent_path, _sanitize_path_name(title)) video_url = self._abs_url_from_link(link) @@ -580,6 +584,7 @@ class IliasCrawler: # Two divs, side by side. Left is the name, right is the link ==> get left # sibling file_name = file_link.parent.findPrevious(name="div").getText().strip() + file_name = _sanitize_path_name(file_name) url = self._abs_url_from_link(file_link) LOGGER.debug("Found file %r at %r", file_name, url) From 98834c9c951c5e7eff26986004ddc54059bcc785 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 12 Nov 2020 20:23:36 +0100 Subject: [PATCH 016/524] Bump version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e35f209..388f9a4 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.3 +$ pip install git+https://github.com/Garmelon/PFERD@v2.4.4 ``` The use of [venv] is recommended. 
@@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.3 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.3/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.4.4 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.4/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index 8335f7f..05fe3c2 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.4.3", + version="2.4.4", packages=find_packages(), install_requires=[ "requests>=2.21.0", From cd90a60dee340057b75196c01f70d79a504c6fe7 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 12 Nov 2020 20:52:46 +0100 Subject: [PATCH 017/524] Move "sanitize_windows_path" to PFERD.transform --- PFERD/transform.py | 17 ++++++++++++++++- sync_url.py | 16 +++------------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/PFERD/transform.py b/PFERD/transform.py index 16769df..7a05dd1 100644 --- a/PFERD/transform.py +++ b/PFERD/transform.py @@ -5,6 +5,8 @@ only files whose names match a regex, or renaming files from one numbering scheme to another. """ +import os +import re from dataclasses import dataclass from pathlib import PurePath from typing import Callable, List, Optional, TypeVar @@ -45,7 +47,8 @@ def apply_transform( # Transform combinators -keep = lambda path: path +def keep(path: PurePath) -> Optional[PurePath]: + return path def attempt(*args: Transform) -> Transform: def inner(path: PurePath) -> Optional[PurePath]: @@ -125,3 +128,15 @@ def re_rename(regex: Regex, target: str) -> Transform: return path.with_name(target.format(*groups)) return None return inner + + +def sanitize_windows_path(path: PurePath) -> Optional[PurePath]: + """ + A small function to escape characters that are forbidden in windows path names. 
+ This method is a no-op on other operating systems. + """ + # Escape windows illegal path characters + if os.name == 'nt': + sanitized_parts = [re.sub(r'[<>:"/|?]', "_", x) for x in list(path.parts)] + return PurePath(*sanitized_parts) + return path diff --git a/sync_url.py b/sync_url.py index 2b8bc56..ddd239a 100755 --- a/sync_url.py +++ b/sync_url.py @@ -5,27 +5,17 @@ A simple script to download a course by name from ILIAS. """ import argparse -import os -import re -from pathlib import Path, PurePath -from typing import Optional +from pathlib import Path from urllib.parse import urlparse from PFERD import Pferd from PFERD.cookie_jar import CookieJar from PFERD.ilias import (IliasCrawler, IliasElementType, KitShibbolethAuthenticator) +from PFERD.transform import sanitize_windows_path from PFERD.utils import to_path -def sanitize_path(path: PurePath) -> Optional[PurePath]: - # Escape windows illegal path characters - if os.name == 'nt': - sanitized_parts = [re.sub(r'[<>:"/|?]', "_", x) for x in list(path.parts)] - return PurePath(*sanitized_parts) - return path - - def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--test-run", action="store_true") @@ -71,7 +61,7 @@ def main() -> None: full_url=args.url, cookies=args.cookies, dir_filter=dir_filter, - transform=sanitize_path + transform=sanitize_windows_path ) From 8ebf0eab169a72eb0f82e410a2391886e3a6aeb1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 17 Nov 2020 21:36:04 +0100 Subject: [PATCH 018/524] Sort download summary --- PFERD/download_summary.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/PFERD/download_summary.py b/PFERD/download_summary.py index c8135de..3b9a024 100644 --- a/PFERD/download_summary.py +++ b/PFERD/download_summary.py @@ -5,6 +5,12 @@ from pathlib import Path from typing import List +def _mergeNoDuplicate(first: List[Path], second: List[Path]) -> List[Path]: + tmp = list(set(first + second)) + tmp.sort(key=lambda x: 
str(x.resolve())) + return tmp + + class DownloadSummary: """ Keeps track of all new, modified or deleted files and provides a summary. @@ -40,9 +46,9 @@ class DownloadSummary: """ Merges ourselves with the passed summary. Modifies this object, but not the passed one. """ - self._new_files = list(set(self._new_files + summary.new_files)) - self._modified_files = list(set(self._modified_files + summary.modified_files)) - self._deleted_files = list(set(self._deleted_files + summary.deleted_files)) + self._new_files = _mergeNoDuplicate(self._new_files, summary.new_files) + self._modified_files = _mergeNoDuplicate(self._modified_files, summary.modified_files) + self._deleted_files = _mergeNoDuplicate(self._deleted_files, summary.deleted_files) def add_deleted_file(self, path: Path) -> None: """ From ba9215ebe81e67940f88c52eb1a42b2dc480661b Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 18 Nov 2020 10:09:45 +0100 Subject: [PATCH 019/524] Bump version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 388f9a4..3a877c1 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.4 +$ pip install git+https://github.com/Garmelon/PFERD@v2.4.5 ``` The use of [venv] is recommended. 
@@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.4 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.4/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.4.5 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.5/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index 05fe3c2..9b226f8 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.4.4", + version="2.4.5", packages=find_packages(), install_requires=[ "requests>=2.21.0", From ba3c7f85fae0e046889a5579586281df83999ff8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 19 Nov 2020 19:37:28 +0100 Subject: [PATCH 020/524] Replace "\" in ILIAS paths as well I am not sure whether anybody really uses a backslash in their names, but I guess it can't hurt to do this for windows users. 
--- PFERD/ilias/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index f5b1ae8..2e37e36 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -27,7 +27,7 @@ PRETTY = PrettyLogger(LOGGER) def _sanitize_path_name(name: str) -> str: - return name.replace("/", "-") + return name.replace("/", "-").replace("\\", "-") class IliasElementType(Enum): From 9cbea5fe06b81f37ce50c871b127b17648493367 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 23 Nov 2020 10:16:40 +0100 Subject: [PATCH 021/524] Add requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f851c23 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests>=2.21.0 +beautifulsoup4>=4.7.1 +rich>=2.1.0 From ecdbca8fb6f40aed2ffce34b6aeb69643e35edd3 Mon Sep 17 00:00:00 2001 From: Christophe Date: Wed, 2 Dec 2020 16:50:30 +0100 Subject: [PATCH 022/524] Make sync_url work relative to cwd like sane programs --- sync_url.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/sync_url.py b/sync_url.py index ddd239a..c327b9d 100755 --- a/sync_url.py +++ b/sync_url.py @@ -35,19 +35,15 @@ def main() -> None: cookie_jar.load_cookies() - if args.folder is not None: - folder = args.folder - # Initialize pferd at the *parent of the passed folder* - # This is needed so Pferd's internal protections against escaping the working directory - # do not trigger (e.g. 
if somebody names a file in ILIAS '../../bad thing.txt') - pferd = Pferd(Path(Path(__file__).parent, folder).parent, test_run=args.test_run) - else: - # fetch course name from ilias + folder = args.folder + if args.folder is None: folder = crawler.find_element_name(args.url) cookie_jar.save_cookies() - # Initialize pferd at the location of the script - pferd = Pferd(Path(__file__).parent, test_run=args.test_run) + # files may not escape the pferd_root with relative paths + # note: Path(Path.cwd, Path(folder)) == Path(folder) if it is an absolute path + pferd_root = Path(Path.cwd(), Path(folder)).parent + pferd = Pferd(pferd_root, test_run=args.test_run) def dir_filter(_: Path, element: IliasElementType) -> bool: if args.no_videos: From f3a46634913d4b9d7c21389195d8b10eb5488017 Mon Sep 17 00:00:00 2001 From: Christophe Date: Wed, 2 Dec 2020 16:58:36 +0100 Subject: [PATCH 023/524] Add passive/no_prompt flag --- PFERD/organizer.py | 17 +++++++++++++---- PFERD/pferd.py | 7 +++++-- sync_url.py | 5 ++++- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/PFERD/organizer.py b/PFERD/organizer.py index 87bc684..346df76 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -26,7 +26,7 @@ class FileAcceptException(Exception): class Organizer(Location): """A helper for managing downloaded files.""" - def __init__(self, path: Path): + def __init__(self, path: Path, no_prompt: bool = False): """Create a new organizer for a given path.""" super().__init__(path) self._known_files: Set[Path] = set() @@ -36,6 +36,8 @@ class Organizer(Location): self.download_summary = DownloadSummary() + self.not_prompting = no_prompt + def accept_file(self, src: Path, dst: PurePath) -> Optional[Path]: """ Move a file to this organizer and mark it. 
@@ -67,13 +69,18 @@ class Organizer(Location): if self._is_marked(dst): PRETTY.warning(f"File {str(dst_absolute)!r} was already written!") - if not prompt_yes_no(f"Overwrite file?", default=False): + default_action: bool = False + if self.not_prompting and not default_action \ + or not self.not_prompting and not prompt_yes_no(f"Overwrite file?", default=default_action): PRETTY.ignored_file(dst_absolute, "file was written previously") return None # Destination file is directory if dst_absolute.exists() and dst_absolute.is_dir(): - if prompt_yes_no(f"Overwrite folder {dst_absolute} with file?", default=False): + default_action: bool = False + if self.not_prompting and default_action \ + or not self.not_prompting \ + and prompt_yes_no(f"Overwrite folder {dst_absolute} with file?", default=default_action): shutil.rmtree(dst_absolute) else: PRETTY.warning(f"Could not add file {str(dst_absolute)!r}") @@ -144,6 +151,8 @@ class Organizer(Location): def _delete_file_if_confirmed(self, path: Path) -> None: prompt = f"Do you want to delete {path}" - if prompt_yes_no(prompt, False): + default_action: bool = False + if self.not_prompting and default_action or \ + not self.not_prompting and prompt_yes_no(prompt, default_action): self.download_summary.add_deleted_file(path) path.unlink() diff --git a/PFERD/pferd.py b/PFERD/pferd.py index c01b5fd..57b15f6 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -76,12 +76,13 @@ class Pferd(Location): download_strategy: IliasDownloadStrategy, timeout: int, clean: bool = True, + no_prompt: bool = None ) -> Organizer: # pylint: disable=too-many-locals cookie_jar = CookieJar(to_path(cookies) if cookies else None) session = cookie_jar.create_session() tmp_dir = self._tmp_dir.new_subdir() - organizer = Organizer(self.resolve(to_path(target))) + organizer = Organizer(self.resolve(to_path(target)), no_prompt if no_prompt is not None else False) crawler = IliasCrawler(base_url, session, authenticator, dir_filter) downloader = 
IliasDownloader(tmp_dir, organizer, session, @@ -245,6 +246,7 @@ class Pferd(Location): download_strategy: IliasDownloadStrategy = download_modified_or_new, clean: bool = True, timeout: int = 5, + no_prompt: bool = None ) -> Organizer: """ Synchronizes a folder with a given folder on the ILIAS instance of the KIT. @@ -289,7 +291,8 @@ class Pferd(Location): transform=transform, download_strategy=download_strategy, clean=clean, - timeout=timeout + timeout=timeout, + no_prompt=no_prompt ) self._download_summary.merge(organizer.download_summary) diff --git a/sync_url.py b/sync_url.py index c327b9d..c2ffb93 100755 --- a/sync_url.py +++ b/sync_url.py @@ -21,6 +21,8 @@ def main() -> None: parser.add_argument("--test-run", action="store_true") parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in") parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos") + parser.add_argument('-p', '--passive', action="store_true", + help="Don't prompt for confirmations and use sane defaults") parser.add_argument('url', help="URL to the course page") parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into") args = parser.parse_args() @@ -57,7 +59,8 @@ def main() -> None: full_url=args.url, cookies=args.cookies, dir_filter=dir_filter, - transform=sanitize_windows_path + transform=sanitize_windows_path, + no_prompt=args.passive ) From 49a0ca7a7c149399c93014a0de9769fd20e050bf Mon Sep 17 00:00:00 2001 From: Christophe Date: Wed, 2 Dec 2020 16:59:29 +0100 Subject: [PATCH 024/524] Add myself to LICENSE This should've been done back when I added a PR for adding sync_url but people are lazy smh. 
--- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 2e3fa8c..26bcc0a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw +Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in From 6426060804e0212795b59adff85dbced8cff4b5b Mon Sep 17 00:00:00 2001 From: Christophe Date: Wed, 2 Dec 2020 18:40:45 +0100 Subject: [PATCH 025/524] Fix relative paths bug Introduced in 74ea03945876c94c260b590e6140a7ee50630477 --- sync_url.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sync_url.py b/sync_url.py index c2ffb93..14c2c9e 100755 --- a/sync_url.py +++ b/sync_url.py @@ -37,14 +37,15 @@ def main() -> None: cookie_jar.load_cookies() - folder = args.folder + folder = Path(args.folder) if args.folder is None: - folder = crawler.find_element_name(args.url) + folder = Path(crawler.find_element_name(args.url)) cookie_jar.save_cookies() # files may not escape the pferd_root with relative paths # note: Path(Path.cwd, Path(folder)) == Path(folder) if it is an absolute path pferd_root = Path(Path.cwd(), Path(folder)).parent + folder = folder.name pferd = Pferd(pferd_root, test_run=args.test_run) def dir_filter(_: Path, element: IliasElementType) -> bool: From 9f6dc56a7b88104a726af4059a2f709209ce54ea Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 2 Dec 2020 19:29:52 +0100 Subject: [PATCH 026/524] Use a strategy to decide conflict resolution --- PFERD/organizer.py | 54 +++++++++++++++++++++++++++++++++++----------- PFERD/pferd.py | 38 ++++++++++++++++++++++---------- sync_url.py | 34 +++++++++++++++++++++++------ 3 files changed, 96 insertions(+), 30 deletions(-) diff --git a/PFERD/organizer.py b/PFERD/organizer.py index 346df76..f63e92a 100644 --- 
a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -7,8 +7,9 @@ import filecmp import logging import os import shutil +from enum import Enum from pathlib import Path, PurePath -from typing import List, Optional, Set +from typing import Callable, List, Optional, Set from .download_summary import DownloadSummary from .location import Location @@ -19,6 +20,25 @@ LOGGER = logging.getLogger(__name__) PRETTY = PrettyLogger(LOGGER) +class FileConflictResolution(Enum): + """ + The reaction when confronted with a file conflict. + """ + + OVERWRITE_EXISTING = "overwrite" + KEEP_EXISTING = "keep" + DEFAULT = "default" + PROMPT = "prompt" + + +FileConflictResolver = Callable[[PurePath], FileConflictResolution] + + +def resolve_prompt_user(_path: PurePath) -> FileConflictResolution: + """Resolves conflicts by always asking the user.""" + return FileConflictResolution.PROMPT + + class FileAcceptException(Exception): """An exception while accepting a file.""" @@ -26,7 +46,7 @@ class FileAcceptException(Exception): class Organizer(Location): """A helper for managing downloaded files.""" - def __init__(self, path: Path, no_prompt: bool = False): + def __init__(self, path: Path, conflict_resolver: FileConflictResolver = resolve_prompt_user): """Create a new organizer for a given path.""" super().__init__(path) self._known_files: Set[Path] = set() @@ -36,7 +56,7 @@ class Organizer(Location): self.download_summary = DownloadSummary() - self.not_prompting = no_prompt + self.conflict_resolver = conflict_resolver def accept_file(self, src: Path, dst: PurePath) -> Optional[Path]: """ @@ -69,18 +89,14 @@ class Organizer(Location): if self._is_marked(dst): PRETTY.warning(f"File {str(dst_absolute)!r} was already written!") - default_action: bool = False - if self.not_prompting and not default_action \ - or not self.not_prompting and not prompt_yes_no(f"Overwrite file?", default=default_action): + if self._resolve_conflict(f"Overwrite file?", dst_absolute, default=False): 
PRETTY.ignored_file(dst_absolute, "file was written previously") return None # Destination file is directory if dst_absolute.exists() and dst_absolute.is_dir(): - default_action: bool = False - if self.not_prompting and default_action \ - or not self.not_prompting \ - and prompt_yes_no(f"Overwrite folder {dst_absolute} with file?", default=default_action): + prompt = f"Overwrite folder {dst_absolute} with file?" + if self._resolve_conflict(prompt, dst_absolute, default=False): shutil.rmtree(dst_absolute) else: PRETTY.warning(f"Could not add file {str(dst_absolute)!r}") @@ -151,8 +167,20 @@ class Organizer(Location): def _delete_file_if_confirmed(self, path: Path) -> None: prompt = f"Do you want to delete {path}" - default_action: bool = False - if self.not_prompting and default_action or \ - not self.not_prompting and prompt_yes_no(prompt, default_action): + if self._resolve_conflict(prompt, path, default=False): self.download_summary.add_deleted_file(path) path.unlink() + + def _resolve_conflict(self, prompt: str, path: Path, default: bool) -> bool: + if not self.conflict_resolver: + return prompt_yes_no(prompt, default=default) + + result = self.conflict_resolver(path) + if result == FileConflictResolution.DEFAULT: + return default + if result == FileConflictResolution.KEEP_EXISTING: + return False + if result == FileConflictResolution.OVERWRITE_EXISTING: + return True + + return prompt_yes_no(prompt, default=default) diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 57b15f6..12ead8b 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -18,7 +18,7 @@ from .ipd import (IpdCrawler, IpdDownloader, IpdDownloadInfo, IpdDownloadStrategy, ipd_download_new_or_modified) from .location import Location from .logging import PrettyLogger, enable_logging -from .organizer import Organizer +from .organizer import FileConflictResolver, Organizer, resolve_prompt_user from .tmp_dir import TmpDir from .transform import TF, Transform, apply_transform from .utils import PathLike, 
to_path @@ -76,13 +76,13 @@ class Pferd(Location): download_strategy: IliasDownloadStrategy, timeout: int, clean: bool = True, - no_prompt: bool = None + file_conflict_resolver: FileConflictResolver = resolve_prompt_user ) -> Organizer: # pylint: disable=too-many-locals cookie_jar = CookieJar(to_path(cookies) if cookies else None) session = cookie_jar.create_session() tmp_dir = self._tmp_dir.new_subdir() - organizer = Organizer(self.resolve(to_path(target)), no_prompt if no_prompt is not None else False) + organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver) crawler = IliasCrawler(base_url, session, authenticator, dir_filter) downloader = IliasDownloader(tmp_dir, organizer, session, @@ -118,6 +118,7 @@ class Pferd(Location): download_strategy: IliasDownloadStrategy = download_modified_or_new, clean: bool = True, timeout: int = 5, + file_conflict_resolver: FileConflictResolver = resolve_prompt_user ) -> Organizer: """ Synchronizes a folder with the ILIAS instance of the KIT. @@ -145,6 +146,8 @@ class Pferd(Location): clean {bool} -- Whether to clean up when the method finishes. timeout {int} -- The download timeout for opencast videos. Sadly needed due to a requests bug. + file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal + with overwriting or deleting files. The default always asks the user. """ # This authenticator only works with the KIT ilias instance. 
authenticator = KitShibbolethAuthenticator(username=username, password=password) @@ -160,7 +163,8 @@ class Pferd(Location): transform=transform, download_strategy=download_strategy, clean=clean, - timeout=timeout + timeout=timeout, + file_conflict_resolver=file_conflict_resolver ) self._download_summary.merge(organizer.download_summary) @@ -185,6 +189,7 @@ class Pferd(Location): download_strategy: IliasDownloadStrategy = download_modified_or_new, clean: bool = True, timeout: int = 5, + file_conflict_resolver: FileConflictResolver = resolve_prompt_user ) -> Organizer: """ Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS @@ -211,6 +216,8 @@ class Pferd(Location): clean {bool} -- Whether to clean up when the method finishes. timeout {int} -- The download timeout for opencast videos. Sadly needed due to a requests bug. + file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal + with overwriting or deleting files. The default always asks the user. """ # This authenticator only works with the KIT ilias instance. authenticator = KitShibbolethAuthenticator(username=username, password=password) @@ -226,7 +233,8 @@ class Pferd(Location): transform=transform, download_strategy=download_strategy, clean=clean, - timeout=timeout + timeout=timeout, + file_conflict_resolver=file_conflict_resolver ) self._download_summary.merge(organizer.download_summary) @@ -246,7 +254,7 @@ class Pferd(Location): download_strategy: IliasDownloadStrategy = download_modified_or_new, clean: bool = True, timeout: int = 5, - no_prompt: bool = None + file_conflict_resolver: FileConflictResolver = resolve_prompt_user ) -> Organizer: """ Synchronizes a folder with a given folder on the ILIAS instance of the KIT. @@ -273,6 +281,8 @@ class Pferd(Location): clean {bool} -- Whether to clean up when the method finishes. timeout {int} -- The download timeout for opencast videos. Sadly needed due to a requests bug. 
+ file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal + with overwriting or deleting files. The default always asks the user. """ # This authenticator only works with the KIT ilias instance. authenticator = KitShibbolethAuthenticator(username=username, password=password) @@ -292,7 +302,7 @@ class Pferd(Location): download_strategy=download_strategy, clean=clean, timeout=timeout, - no_prompt=no_prompt + file_conflict_resolver=file_conflict_resolver ) self._download_summary.merge(organizer.download_summary) @@ -306,7 +316,8 @@ class Pferd(Location): url: str, transform: Transform = lambda x: x, download_strategy: IpdDownloadStrategy = ipd_download_new_or_modified, - clean: bool = True + clean: bool = True, + file_conflict_resolver: FileConflictResolver = resolve_prompt_user ) -> Organizer: """ Synchronizes a folder with a DIVA playlist. @@ -322,6 +333,8 @@ class Pferd(Location): be downloaded. Can save bandwidth and reduce the number of requests. (default: {diva_download_new}) clean {bool} -- Whether to clean up when the method finishes. + file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal + with overwriting or deleting files. The default always asks the user. """ tmp_dir = self._tmp_dir.new_subdir() @@ -332,7 +345,7 @@ class Pferd(Location): if isinstance(target, Organizer): organizer = target else: - organizer = Organizer(self.resolve(to_path(target))) + organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver) PRETTY.starting_synchronizer(organizer.path, "IPD", url) @@ -360,7 +373,8 @@ class Pferd(Location): playlist_location: str, transform: Transform = lambda x: x, download_strategy: DivaDownloadStrategy = diva_download_new, - clean: bool = True + clean: bool = True, + file_conflict_resolver: FileConflictResolver = resolve_prompt_user ) -> Organizer: """ Synchronizes a folder with a DIVA playlist. @@ -377,6 +391,8 @@ class Pferd(Location): be downloaded. 
Can save bandwidth and reduce the number of requests. (default: {diva_download_new}) clean {bool} -- Whether to clean up when the method finishes. + file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal + with overwriting or deleting files. The default always asks the user. """ tmp_dir = self._tmp_dir.new_subdir() @@ -392,7 +408,7 @@ class Pferd(Location): if isinstance(target, Organizer): organizer = target else: - organizer = Organizer(self.resolve(to_path(target))) + organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver) PRETTY.starting_synchronizer(organizer.path, "DIVA", playlist_id) diff --git a/sync_url.py b/sync_url.py index 14c2c9e..e06deb6 100755 --- a/sync_url.py +++ b/sync_url.py @@ -5,24 +5,35 @@ A simple script to download a course by name from ILIAS. """ import argparse -from pathlib import Path +from pathlib import Path, PurePath from urllib.parse import urlparse from PFERD import Pferd from PFERD.cookie_jar import CookieJar from PFERD.ilias import (IliasCrawler, IliasElementType, KitShibbolethAuthenticator) +from PFERD.organizer import FileConflictResolution, resolve_prompt_user from PFERD.transform import sanitize_windows_path from PFERD.utils import to_path +def _resolve_overwrite(_path: PurePath) -> FileConflictResolution: + return FileConflictResolution.OVERWRITE_EXISTING + + +def _resolve_default(_path: PurePath) -> FileConflictResolution: + return FileConflictResolution.DEFAULT + + def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--test-run", action="store_true") parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in") parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos") - parser.add_argument('-p', '--passive', action="store_true", + parser.add_argument('-d', '--default', action="store_true", help="Don't prompt for confirmations and use sane defaults") + parser.add_argument('-r', 
'--remove', action="store_true", + help="Remove and overwrite files without prompting for confirmation") parser.add_argument('url', help="URL to the course page") parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into") args = parser.parse_args() @@ -39,13 +50,17 @@ def main() -> None: folder = Path(args.folder) if args.folder is None: - folder = Path(crawler.find_element_name(args.url)) + element_name = crawler.find_element_name(args.url) + if not element_name: + print("Error, could not get element name. Please specify a folder yourself.") + return + folder = Path(element_name) cookie_jar.save_cookies() # files may not escape the pferd_root with relative paths # note: Path(Path.cwd, Path(folder)) == Path(folder) if it is an absolute path pferd_root = Path(Path.cwd(), Path(folder)).parent - folder = folder.name + target = folder.name pferd = Pferd(pferd_root, test_run=args.test_run) def dir_filter(_: Path, element: IliasElementType) -> bool: @@ -53,15 +68,22 @@ def main() -> None: return element not in [IliasElementType.VIDEO_FILE, IliasElementType.VIDEO_FOLDER] return True + if args.default: + file_confilict_resolver = _resolve_default + elif args.remove: + file_confilict_resolver = _resolve_overwrite + else: + file_confilict_resolver = resolve_prompt_user + pferd.enable_logging() # fetch pferd.ilias_kit_folder( - target=folder, + target=target, full_url=args.url, cookies=args.cookies, dir_filter=dir_filter, transform=sanitize_windows_path, - no_prompt=args.passive + file_conflict_resolver=file_confilict_resolver ) From fcb3884a8fcd0c8cef6f79b0083c49b45afb4ff9 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 5 Dec 2020 13:47:53 +0100 Subject: [PATCH 027/524] Add --remote-first, --local-first and --no-delete flags --- PFERD/organizer.py | 40 ++++++++++++++++++++++++++++++---------- sync_url.py | 39 ++++++++++++++++++++++++++------------- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/PFERD/organizer.py 
b/PFERD/organizer.py index f63e92a..a41d0d2 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -20,21 +20,37 @@ LOGGER = logging.getLogger(__name__) PRETTY = PrettyLogger(LOGGER) +class ConflictType(Enum): + """ + The type of the conflict. A file might not exist anymore and will be deleted + or it might be overwritten with a newer version. + """ + FILE_OVERWRITTEN = "overwritten" + FILE_DELETED = "deleted" + + class FileConflictResolution(Enum): """ - The reaction when confronted with a file conflict. + The reaction when confronted with a file conflict: """ - OVERWRITE_EXISTING = "overwrite" + DESTROY_EXISTING = "destroy" + """Delete/overwrite the current file""" + KEEP_EXISTING = "keep" + """Keep the current file""" + DEFAULT = "default" + """Do whatever the PFERD authors thought is sensible""" + PROMPT = "prompt" + """Interactively ask the user""" -FileConflictResolver = Callable[[PurePath], FileConflictResolution] +FileConflictResolver = Callable[[PurePath, ConflictType], FileConflictResolution] -def resolve_prompt_user(_path: PurePath) -> FileConflictResolution: +def resolve_prompt_user(_path: PurePath, _conflict: ConflictType) -> FileConflictResolution: """Resolves conflicts by always asking the user.""" return FileConflictResolution.PROMPT @@ -89,14 +105,16 @@ class Organizer(Location): if self._is_marked(dst): PRETTY.warning(f"File {str(dst_absolute)!r} was already written!") - if self._resolve_conflict(f"Overwrite file?", dst_absolute, default=False): + conflict = ConflictType.FILE_OVERWRITTEN + if self._resolve_conflict(f"Overwrite file?", dst_absolute, conflict, default=False): PRETTY.ignored_file(dst_absolute, "file was written previously") return None # Destination file is directory if dst_absolute.exists() and dst_absolute.is_dir(): prompt = f"Overwrite folder {dst_absolute} with file?" 
- if self._resolve_conflict(prompt, dst_absolute, default=False): + conflict = ConflictType.FILE_OVERWRITTEN + if self._resolve_conflict(prompt, dst_absolute, conflict, default=False): shutil.rmtree(dst_absolute) else: PRETTY.warning(f"Could not add file {str(dst_absolute)!r}") @@ -167,20 +185,22 @@ class Organizer(Location): def _delete_file_if_confirmed(self, path: Path) -> None: prompt = f"Do you want to delete {path}" - if self._resolve_conflict(prompt, path, default=False): + if self._resolve_conflict(prompt, path, ConflictType.FILE_DELETED, default=False): self.download_summary.add_deleted_file(path) path.unlink() - def _resolve_conflict(self, prompt: str, path: Path, default: bool) -> bool: + def _resolve_conflict( + self, prompt: str, path: Path, conflict: ConflictType, default: bool + ) -> bool: if not self.conflict_resolver: return prompt_yes_no(prompt, default=default) - result = self.conflict_resolver(path) + result = self.conflict_resolver(path, conflict) if result == FileConflictResolution.DEFAULT: return default if result == FileConflictResolution.KEEP_EXISTING: return False - if result == FileConflictResolution.OVERWRITE_EXISTING: + if result == FileConflictResolution.DESTROY_EXISTING: return True return prompt_yes_no(prompt, default=default) diff --git a/sync_url.py b/sync_url.py index e06deb6..91a7521 100755 --- a/sync_url.py +++ b/sync_url.py @@ -12,17 +12,26 @@ from PFERD import Pferd from PFERD.cookie_jar import CookieJar from PFERD.ilias import (IliasCrawler, IliasElementType, KitShibbolethAuthenticator) -from PFERD.organizer import FileConflictResolution, resolve_prompt_user +from PFERD.organizer import (ConflictType, FileConflictResolution, + FileConflictResolver, resolve_prompt_user) from PFERD.transform import sanitize_windows_path from PFERD.utils import to_path -def _resolve_overwrite(_path: PurePath) -> FileConflictResolution: - return FileConflictResolution.OVERWRITE_EXISTING +def _resolve_remote_first(_path: PurePath, _conflict: 
ConflictType) -> FileConflictResolution: + return FileConflictResolution.DESTROY_EXISTING -def _resolve_default(_path: PurePath) -> FileConflictResolution: - return FileConflictResolution.DEFAULT +def _resolve_local_first(_path: PurePath, _conflict: ConflictType) -> FileConflictResolution: + return FileConflictResolution.KEEP_EXISTING + + +def _resolve_no_delete(_path: PurePath, conflict: ConflictType) -> FileConflictResolution: + # Update files + if conflict == ConflictType.FILE_OVERWRITTEN: + return FileConflictResolution.DESTROY_EXISTING + # But do not delete them + return FileConflictResolution.KEEP_EXISTING def main() -> None: @@ -30,10 +39,12 @@ def main() -> None: parser.add_argument("--test-run", action="store_true") parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in") parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos") - parser.add_argument('-d', '--default', action="store_true", - help="Don't prompt for confirmations and use sane defaults") - parser.add_argument('-r', '--remove', action="store_true", - help="Remove and overwrite files without prompting for confirmation") + parser.add_argument('--local-first', action="store_true", + help="Don't prompt for confirmation, keep existing files") + parser.add_argument('--remote-first', action="store_true", + help="Don't prompt for confirmation, delete and overwrite local files") + parser.add_argument('--no-delete', action="store_true", + help="Don't prompt for confirmation, overwrite local files, don't delete") parser.add_argument('url', help="URL to the course page") parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into") args = parser.parse_args() @@ -68,10 +79,12 @@ def main() -> None: return element not in [IliasElementType.VIDEO_FILE, IliasElementType.VIDEO_FOLDER] return True - if args.default: - file_confilict_resolver = _resolve_default - elif args.remove: - file_confilict_resolver = 
_resolve_overwrite + if args.remote_first: + file_confilict_resolver: FileConflictResolver = _resolve_remote_first + elif args.local_first: + file_confilict_resolver = _resolve_local_first + elif args.no_delete: + file_confilict_resolver = _resolve_no_delete else: file_confilict_resolver = resolve_prompt_user From 2d644095429e39b6b307c50a7a22fb0e0a1ee1eb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 5 Dec 2020 13:50:46 +0100 Subject: [PATCH 028/524] Fix handling of empty args.folder --- sync_url.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sync_url.py b/sync_url.py index 91a7521..97c0c81 100755 --- a/sync_url.py +++ b/sync_url.py @@ -59,7 +59,6 @@ def main() -> None: cookie_jar.load_cookies() - folder = Path(args.folder) if args.folder is None: element_name = crawler.find_element_name(args.url) if not element_name: @@ -67,6 +66,8 @@ def main() -> None: return folder = Path(element_name) cookie_jar.save_cookies() + else: + folder = Path(args.folder) # files may not escape the pferd_root with relative paths # note: Path(Path.cwd, Path(folder)) == Path(folder) if it is an absolute path From 4ce385b262daa1064002d000ea75ea9f705c151e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 5 Dec 2020 14:03:43 +0100 Subject: [PATCH 029/524] Treat file overwrite and marked file overwrite differently --- PFERD/organizer.py | 29 ++++++++++++++++++++++------- sync_url.py | 2 ++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/PFERD/organizer.py b/PFERD/organizer.py index a41d0d2..1038ae7 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -24,34 +24,44 @@ class ConflictType(Enum): """ The type of the conflict. A file might not exist anymore and will be deleted or it might be overwritten with a newer version. 
+ + FILE_OVERWRITTEN: An existing file will be updated + MARKED_FILE_OVERWRITTEN: A file is written for the second+ time in this run + FILE_DELETED: The file was deleted """ FILE_OVERWRITTEN = "overwritten" + MARKED_FILE_OVERWRITTEN = "marked_file_overwritten" FILE_DELETED = "deleted" class FileConflictResolution(Enum): """ The reaction when confronted with a file conflict: + + DESTROY_EXISTING: Delete/overwrite the current file + KEEP_EXISTING: Keep the current file + DEFAULT: Do whatever the PFERD authors thought is sensible + PROMPT: Interactively ask the user """ DESTROY_EXISTING = "destroy" - """Delete/overwrite the current file""" KEEP_EXISTING = "keep" - """Keep the current file""" DEFAULT = "default" - """Do whatever the PFERD authors thought is sensible""" PROMPT = "prompt" - """Interactively ask the user""" FileConflictResolver = Callable[[PurePath, ConflictType], FileConflictResolution] -def resolve_prompt_user(_path: PurePath, _conflict: ConflictType) -> FileConflictResolution: - """Resolves conflicts by always asking the user.""" +def resolve_prompt_user(_path: PurePath, conflict: ConflictType) -> FileConflictResolution: + """ + Resolves conflicts by asking the user if a file was written twice or will be deleted. + """ + if conflict == ConflictType.FILE_OVERWRITTEN: + return FileConflictResolution.DESTROY_EXISTING return FileConflictResolution.PROMPT @@ -105,7 +115,7 @@ class Organizer(Location): if self._is_marked(dst): PRETTY.warning(f"File {str(dst_absolute)!r} was already written!") - conflict = ConflictType.FILE_OVERWRITTEN + conflict = ConflictType.MARKED_FILE_OVERWRITTEN if self._resolve_conflict(f"Overwrite file?", dst_absolute, conflict, default=False): PRETTY.ignored_file(dst_absolute, "file was written previously") return None @@ -128,6 +138,11 @@ class Organizer(Location): self.mark(dst) return dst_absolute + prompt = f"Overwrite file {dst_absolute}?" 
+ conflict = ConflictType.FILE_OVERWRITTEN + if not self._resolve_conflict(prompt, dst_absolute, conflict, default=True): + return None + self.download_summary.add_modified_file(dst_absolute) PRETTY.modified_file(dst_absolute) else: diff --git a/sync_url.py b/sync_url.py index 97c0c81..c6231e4 100755 --- a/sync_url.py +++ b/sync_url.py @@ -30,6 +30,8 @@ def _resolve_no_delete(_path: PurePath, conflict: ConflictType) -> FileConflictR # Update files if conflict == ConflictType.FILE_OVERWRITTEN: return FileConflictResolution.DESTROY_EXISTING + if conflict == ConflictType.MARKED_FILE_OVERWRITTEN: + return FileConflictResolution.DESTROY_EXISTING # But do not delete them return FileConflictResolution.KEEP_EXISTING From 57259e21f462b56826acf4b041d9005be4c26365 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 5 Dec 2020 14:08:00 +0100 Subject: [PATCH 030/524] Print download summary in sync_url --- sync_url.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sync_url.py b/sync_url.py index c6231e4..6629a18 100755 --- a/sync_url.py +++ b/sync_url.py @@ -102,6 +102,8 @@ def main() -> None: file_conflict_resolver=file_confilict_resolver ) + pferd.print_summary() + if __name__ == "__main__": main() From 0f5e55648be99b73fbe349ecc6a97b110d8dbe66 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 5 Dec 2020 14:11:51 +0100 Subject: [PATCH 031/524] Tell user when the conflict resolver kept existing files --- PFERD/organizer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PFERD/organizer.py b/PFERD/organizer.py index 1038ae7..a15e751 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -141,6 +141,7 @@ class Organizer(Location): prompt = f"Overwrite file {dst_absolute}?" 
conflict = ConflictType.FILE_OVERWRITTEN if not self._resolve_conflict(prompt, dst_absolute, conflict, default=True): + PRETTY.ignored_file(dst_absolute, "user conflict resolution") return None self.download_summary.add_modified_file(dst_absolute) @@ -203,6 +204,8 @@ class Organizer(Location): if self._resolve_conflict(prompt, path, ConflictType.FILE_DELETED, default=False): self.download_summary.add_deleted_file(path) path.unlink() + else: + PRETTY.ignored_file(path, "user conflict resolution") def _resolve_conflict( self, prompt: str, path: Path, conflict: ConflictType, default: bool From 1e0343bba667ca47e7470e5d2d5aca84d832bf9a Mon Sep 17 00:00:00 2001 From: Lucas <24826124+Luro02@users.noreply.github.com> Date: Tue, 3 Nov 2020 13:38:33 +0100 Subject: [PATCH 032/524] sync_url: Add username and password args --- sync_url.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sync_url.py b/sync_url.py index 6629a18..e4c4c9a 100755 --- a/sync_url.py +++ b/sync_url.py @@ -40,6 +40,8 @@ def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--test-run", action="store_true") parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in") + parser.add_argument('-u', '--username', nargs='?', default=None, help="Username for Ilias") + parser.add_argument('-p', '--password', nargs='?', default=None, help="Password for Ilias") parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos") parser.add_argument('--local-first', action="store_true", help="Don't prompt for confirmation, keep existing files") @@ -55,7 +57,7 @@ def main() -> None: cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None) session = cookie_jar.create_session() - authenticator = KitShibbolethAuthenticator() + authenticator = KitShibbolethAuthenticator(username=args.username, password=args.password) crawler = IliasCrawler(url.scheme + '://' + url.netloc, session, authenticator, lambda x, 
y: True) @@ -98,6 +100,8 @@ def main() -> None: full_url=args.url, cookies=args.cookies, dir_filter=dir_filter, + username=args.username, + password=args.password, transform=sanitize_windows_path, file_conflict_resolver=file_confilict_resolver ) From 75471c46d1a62447c38763517479cb31be4949da Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 Nov 2020 21:18:48 +0100 Subject: [PATCH 033/524] Use credential file --- sync_url.py | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/sync_url.py b/sync_url.py index e4c4c9a..beba144 100755 --- a/sync_url.py +++ b/sync_url.py @@ -5,18 +5,42 @@ A simple script to download a course by name from ILIAS. """ import argparse +import logging +import sys from pathlib import Path, PurePath +from typing import Optional, Tuple from urllib.parse import urlparse from PFERD import Pferd from PFERD.cookie_jar import CookieJar from PFERD.ilias import (IliasCrawler, IliasElementType, KitShibbolethAuthenticator) +from PFERD.logging import PrettyLogger, enable_logging from PFERD.organizer import (ConflictType, FileConflictResolution, FileConflictResolver, resolve_prompt_user) from PFERD.transform import sanitize_windows_path from PFERD.utils import to_path +_LOGGER = logging.getLogger("sync_url") +_PRETTY = PrettyLogger(_LOGGER) + + +def _extract_credentials(file_path: Optional[str]) -> Tuple[Optional[str], Optional[str]]: + if not file_path: + return (None, None) + + if not Path(file_path).exists(): + _PRETTY.error("Credential file does not exist") + sys.exit(1) + + with open(file_path, "r") as file: + first_line = file.read().splitlines()[0] + read_name, *read_password = first_line.split(":", 1) + + name = read_name if read_name else None + password = read_password[0] if read_password else None + return (name, password) + def _resolve_remote_first(_path: PurePath, _conflict: ConflictType) -> FileConflictResolution: return FileConflictResolution.DESTROY_EXISTING @@ -37,11 
+61,16 @@ def _resolve_no_delete(_path: PurePath, conflict: ConflictType) -> FileConflictR def main() -> None: + enable_logging(name="sync_url") + parser = argparse.ArgumentParser() parser.add_argument("--test-run", action="store_true") parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in") parser.add_argument('-u', '--username', nargs='?', default=None, help="Username for Ilias") parser.add_argument('-p', '--password', nargs='?', default=None, help="Password for Ilias") + parser.add_argument('--credential-file', nargs='?', default=None, + help="Path to a file containing credentials for Ilias. The file must have " + "one line in the following format: ':'") parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos") parser.add_argument('--local-first', action="store_true", help="Don't prompt for confirmation, keep existing files") @@ -53,11 +82,13 @@ def main() -> None: parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into") args = parser.parse_args() - url = urlparse(args.url) - cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None) session = cookie_jar.create_session() - authenticator = KitShibbolethAuthenticator(username=args.username, password=args.password) + + username, password = _extract_credentials(args.credential_file) + authenticator = KitShibbolethAuthenticator(username=username, password=password) + + url = urlparse(args.url) crawler = IliasCrawler(url.scheme + '://' + url.netloc, session, authenticator, lambda x, y: True) @@ -100,10 +131,10 @@ def main() -> None: full_url=args.url, cookies=args.cookies, dir_filter=dir_filter, - username=args.username, - password=args.password, - transform=sanitize_windows_path, - file_conflict_resolver=file_confilict_resolver + username=username, + password=password, + file_conflict_resolver=file_confilict_resolver, + transform=sanitize_windows_path ) pferd.print_summary() From 
83ea15ee83e42cbdbf23a0a56e82d142c38ff636 Mon Sep 17 00:00:00 2001 From: Scriptim Date: Wed, 4 Nov 2020 00:18:27 +0100 Subject: [PATCH 034/524] Use system keyring service for password auth --- LICENSE | 4 ++ PFERD/authenticators.py | 89 +++++++++++++++++++++++++++++++++++ PFERD/ilias/__init__.py | 3 +- PFERD/ilias/authenticators.py | 8 +++- requirements.txt | 1 + setup.py | 3 +- sync_url.py | 27 ++++++++--- 7 files changed, 125 insertions(+), 10 deletions(-) diff --git a/LICENSE b/LICENSE index 26bcc0a..7e4f54e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,8 @@ +<<<<<<< HEAD Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe +======= +Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, Scriptim +>>>>>>> f89226c (Use system keyring service for password auth) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/authenticators.py b/PFERD/authenticators.py index b8cfe28..5537cc1 100644 --- a/PFERD/authenticators.py +++ b/PFERD/authenticators.py @@ -3,8 +3,19 @@ General authenticators useful in many situations """ import getpass +import logging from typing import Optional, Tuple +from .logging import PrettyLogger + +LOGGER = logging.getLogger(__name__) +PRETTY = PrettyLogger(LOGGER) + +try: + import keyring +except ImportError: + PRETTY.warning("Keyring module not found, KeyringAuthenticator won't work!") + class TfaAuthenticator: # pylint: disable=too-few-public-methods @@ -123,3 +134,81 @@ class UserPassAuthenticator: if self._given_username is not None and self._given_password is not None: self._given_username = None self._given_password = None + + +class KeyringAuthenticator(UserPassAuthenticator): + """ + An authenticator for username-password combinations that stores the + password using the system keyring service and prompts the user for missing + information. 
+ """ + + def get_credentials(self) -> Tuple[str, str]: + """ + Returns a tuple (username, password). Prompts user for username or + password when necessary. + """ + + if self._username is None and self._given_username is not None: + self._username = self._given_username + + if self._password is None and self._given_password is not None: + self._password = self._given_password + + if self._username is not None and self._password is None: + self._load_password() + + if self._username is None or self._password is None: + print(f"Enter credentials ({self._reason})") + + username: str + if self._username is None: + username = input("Username: ") + self._username = username + else: + username = self._username + + if self._password is None: + self._load_password() + + password: str + if self._password is None: + password = getpass.getpass(prompt="Password: ") + self._password = password + self._save_password() + else: + password = self._password + + return (username, password) + + def _load_password(self) -> None: + """ + Loads the saved password associated with self._username from the system + keyring service (or None if not password has been saved yet) and stores + it in self._password. + """ + self._password = keyring.get_password("pferd-ilias", self._username) + + def _save_password(self) -> None: + """ + Saves self._password to the system keyring service and associates it + with self._username. + """ + keyring.set_password("pferd-ilias", self._username, self._password) + + def invalidate_credentials(self) -> None: + """ + Marks the credentials as invalid. If only a username was supplied in + the constructor, assumes that the username is valid and only the + password is invalid. If only a password was supplied in the + constructor, assumes that the password is valid and only the username + is invalid. Otherwise, assumes that username and password are both + invalid. 
+ """ + + try: + keyring.delete_password("pferd-ilias", self._username) + except keyring.errors.PasswordDeleteError: + pass + + super().invalidate_credentials() diff --git a/PFERD/ilias/__init__.py b/PFERD/ilias/__init__.py index 0a5f08b..379d244 100644 --- a/PFERD/ilias/__init__.py +++ b/PFERD/ilias/__init__.py @@ -2,7 +2,8 @@ Synchronizing files from ILIAS instances (https://www.ilias.de/). """ -from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator +from .authenticators import (IliasAuthenticator, KitShibbolethAuthenticator, + KeyringKitShibbolethAuthenticator) from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter, IliasElementType) from .downloader import (IliasDownloader, IliasDownloadInfo, diff --git a/PFERD/ilias/authenticators.py b/PFERD/ilias/authenticators.py index 763ed38..e70f459 100644 --- a/PFERD/ilias/authenticators.py +++ b/PFERD/ilias/authenticators.py @@ -37,8 +37,12 @@ class KitShibbolethAuthenticator(IliasAuthenticator): Authenticate via KIT's shibboleth system. 
""" - def __init__(self, username: Optional[str] = None, password: Optional[str] = None) -> None: - self._auth = UserPassAuthenticator("KIT ILIAS Shibboleth", username, password) + def __init__(self, authenticator: Optional[UserPassAuthenticator] = None) -> None: + if authenticator: + self._auth = authenticator + else: + self._auth = UserPassAuthenticator("KIT ILIAS Shibboleth") + self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth") def authenticate(self, sess: requests.Session) -> None: diff --git a/requirements.txt b/requirements.txt index f851c23..2d852e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ requests>=2.21.0 beautifulsoup4>=4.7.1 rich>=2.1.0 +keyring>=21.5.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 9b226f8..6650016 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,8 @@ setup( install_requires=[ "requests>=2.21.0", "beautifulsoup4>=4.7.1", - "rich>=2.1.0" + "rich>=2.1.0", + "keyring>=21.5.0" ], ) diff --git a/sync_url.py b/sync_url.py index beba144..fe0b3c4 100755 --- a/sync_url.py +++ b/sync_url.py @@ -8,10 +8,11 @@ import argparse import logging import sys from pathlib import Path, PurePath -from typing import Optional, Tuple +from typing import Optional from urllib.parse import urlparse from PFERD import Pferd +from PFERD.authenticators import KeyringAuthenticator, UserPassAuthenticator from PFERD.cookie_jar import CookieJar from PFERD.ilias import (IliasCrawler, IliasElementType, KitShibbolethAuthenticator) @@ -25,9 +26,9 @@ _LOGGER = logging.getLogger("sync_url") _PRETTY = PrettyLogger(_LOGGER) -def _extract_credentials(file_path: Optional[str]) -> Tuple[Optional[str], Optional[str]]: +def _extract_credentials(file_path: Optional[str]) -> UserPassAuthenticator: if not file_path: - return (None, None) + return UserPassAuthenticator("KIT ILIAS Shibboleth", None, None) if not Path(file_path).exists(): _PRETTY.error("Credential file does not exist") @@ -39,7 +40,7 @@ def _extract_credentials(file_path: 
Optional[str]) -> Tuple[Optional[str], Optio name = read_name if read_name else None password = read_password[0] if read_password else None - return (name, password) + return UserPassAuthenticator("KIT ILIAS Shibboleth", username=name, password=password) def _resolve_remote_first(_path: PurePath, _conflict: ConflictType) -> FileConflictResolution: @@ -71,6 +72,8 @@ def main() -> None: parser.add_argument('--credential-file', nargs='?', default=None, help="Path to a file containing credentials for Ilias. The file must have " "one line in the following format: ':'") + parser.add_argument("-k", "--keyring", action="store_true", + help="Use the system keyring service for authentication") parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos") parser.add_argument('--local-first', action="store_true", help="Don't prompt for confirmation, keep existing files") @@ -85,10 +88,21 @@ def main() -> None: cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None) session = cookie_jar.create_session() - username, password = _extract_credentials(args.credential_file) - authenticator = KitShibbolethAuthenticator(username=username, password=password) + if args.keyring: + if not args.username: + _PRETTY.error("Keyring auth selected but no --username passed!") + return + inner_auth: UserPassAuthenticator = KeyringAuthenticator( + "KIT ILIAS Shibboleth", username=args.username, password=args.password + ) + else: + inner_auth = _extract_credentials(args.credential_file) + + username, password = inner_auth.get_credentials() + authenticator = KitShibbolethAuthenticator(inner_auth) url = urlparse(args.url) + crawler = IliasCrawler(url.scheme + '://' + url.netloc, session, authenticator, lambda x, y: True) @@ -125,6 +139,7 @@ def main() -> None: file_confilict_resolver = resolve_prompt_user pferd.enable_logging() + # fetch pferd.ilias_kit_folder( target=target, From f47b137b593628e45d34f9674342c039532329e0 Mon Sep 17 00:00:00 2001 From: 
I-Al-Istannen Date: Sat, 5 Dec 2020 23:35:20 +0100 Subject: [PATCH 035/524] Fix ILIAS init.py and Pferd.py authenticators --- PFERD/ilias/__init__.py | 3 +-- PFERD/pferd.py | 14 +++++++++++--- mypy.ini | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/PFERD/ilias/__init__.py b/PFERD/ilias/__init__.py index 379d244..0a5f08b 100644 --- a/PFERD/ilias/__init__.py +++ b/PFERD/ilias/__init__.py @@ -2,8 +2,7 @@ Synchronizing files from ILIAS instances (https://www.ilias.de/). """ -from .authenticators import (IliasAuthenticator, KitShibbolethAuthenticator, - KeyringKitShibbolethAuthenticator) +from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter, IliasElementType) from .downloader import (IliasDownloader, IliasDownloadInfo, diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 12ead8b..1bb6f78 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -6,6 +6,7 @@ import logging from pathlib import Path from typing import Callable, List, Optional, Union +from .authenticators import UserPassAuthenticator from .cookie_jar import CookieJar from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler, diva_download_new) @@ -64,6 +65,13 @@ class Pferd(Location): for transformable in transformables: LOGGER.info(transformable.path) + @staticmethod + def _get_authenticator( + username: Optional[str], password: Optional[str] + ) -> KitShibbolethAuthenticator: + inner_auth = UserPassAuthenticator("ILIAS - Pferd.py", username, password) + return KitShibbolethAuthenticator(inner_auth) + def _ilias( self, target: PathLike, @@ -150,7 +158,7 @@ class Pferd(Location): with overwriting or deleting files. The default always asks the user. """ # This authenticator only works with the KIT ilias instance. 
- authenticator = KitShibbolethAuthenticator(username=username, password=password) + authenticator = Pferd._get_authenticator(username=username, password=password) PRETTY.starting_synchronizer(target, "ILIAS", course_id) organizer = self._ilias( @@ -220,7 +228,7 @@ class Pferd(Location): with overwriting or deleting files. The default always asks the user. """ # This authenticator only works with the KIT ilias instance. - authenticator = KitShibbolethAuthenticator(username=username, password=password) + authenticator = Pferd._get_authenticator(username, password) PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop") organizer = self._ilias( @@ -285,7 +293,7 @@ class Pferd(Location): with overwriting or deleting files. The default always asks the user. """ # This authenticator only works with the KIT ilias instance. - authenticator = KitShibbolethAuthenticator(username=username, password=password) + authenticator = Pferd._get_authenticator(username=username, password=password) PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url") if not full_url.startswith("https://ilias.studium.kit.edu"): diff --git a/mypy.ini b/mypy.ini index 91792d8..60306de 100644 --- a/mypy.ini +++ b/mypy.ini @@ -3,5 +3,5 @@ disallow_untyped_defs = True disallow_incomplete_defs = True no_implicit_optional = True -[mypy-rich.*,bs4] +[mypy-rich.*,bs4,keyring] ignore_missing_imports = True From 93e6329901b83b80ccc2ec339431d2bf2e0d07f6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 6 Dec 2020 13:28:08 +0100 Subject: [PATCH 036/524] Use the least destructive conflict resolver if there are multiple --- sync_url.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sync_url.py b/sync_url.py index fe0b3c4..c6db255 100755 --- a/sync_url.py +++ b/sync_url.py @@ -129,12 +129,12 @@ def main() -> None: return element not in [IliasElementType.VIDEO_FILE, IliasElementType.VIDEO_FOLDER] return True - if args.remote_first: - file_confilict_resolver: 
FileConflictResolver = _resolve_remote_first - elif args.local_first: - file_confilict_resolver = _resolve_local_first + if args.local_first: + file_confilict_resolver: FileConflictResolver = _resolve_local_first elif args.no_delete: file_confilict_resolver = _resolve_no_delete + elif args.remote_first: + file_confilict_resolver = _resolve_remote_first else: file_confilict_resolver = resolve_prompt_user From ee39aaf08b7e031cb8d1ed5fb675c672419162dc Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 7 Dec 2020 22:55:28 +0100 Subject: [PATCH 037/524] Fix merge marker in LICENSE --- LICENSE | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/LICENSE b/LICENSE index 7e4f54e..01f15f5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,8 +1,4 @@ -<<<<<<< HEAD -Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe -======= -Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, Scriptim ->>>>>>> f89226c (Use system keyring service for password auth) +Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -19,4 +15,4 @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
From 1c2b6bf9946e31914ba41421d33bfa83c03258c6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 13 Dec 2020 19:57:29 +0100 Subject: [PATCH 038/524] Bump version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3a877c1..572528a 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.5 +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.0 ``` The use of [venv] is recommended. @@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.4.5 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.4.5/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.0 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.0/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index 6650016..1ed2876 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.4.5", + version="2.5.0", packages=find_packages(), install_requires=[ "requests>=2.21.0", From 9b048a9cfc43a97c7db696019ce792ed35a8a6d1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 30 Dec 2020 14:32:59 +0100 Subject: [PATCH 039/524] Canonize meeting names to a properly formatted date --- PFERD/ilias/crawler.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 2e37e36..4d59dbf 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -40,6 +40,7 @@ class IliasElementType(Enum): REGULAR_FILE = "REGULAR_FILE" VIDEO_FILE = "VIDEO_FILE" FORUM = "FORUM" 
+ MEETING = "MEETING" EXTERNAL_LINK = "EXTERNAL_LINK" def is_folder(self) -> bool: @@ -241,6 +242,8 @@ class IliasCrawler: entries_to_process += self._crawl_video_directory(entry.path, url) continue + PRETTY.warning(f"Unknown type: {entry.entry_type}!") + return result def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]: @@ -269,6 +272,19 @@ class IliasCrawler: if element_type == IliasElementType.REGULAR_FILE: result += self._crawl_file(folder_path, link, abs_url) + elif element_type == IliasElementType.MEETING: + meeting_name = str(element_path.name) + date_portion_str = meeting_name.split(" - ")[0] + date_portion = demangle_date(date_portion_str) + + if not date_portion: + result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] + continue + + rest_of_name = meeting_name.removeprefix(date_portion_str) + new_name = datetime.datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name + new_path = Path(folder_path, _sanitize_path_name(new_name)) + result += [IliasCrawlerEntry(new_path, abs_url, IliasElementType.REGULAR_FOLDER, None)] elif element_type is not None: result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] else: @@ -320,6 +336,8 @@ class IliasCrawler: """ # pylint: disable=too-many-return-statements + found_parent: Optional[bs4.Tag] = None + # We look for the outer div of our inner link, to find information around it # (mostly the icon) for parent in link_element.parents: @@ -350,6 +368,9 @@ class IliasCrawler: if str(img_tag["src"]).endswith("frm.svg"): return IliasElementType.FORUM + if str(img_tag["src"]).endswith("sess.svg"): + return IliasElementType.MEETING + return IliasElementType.REGULAR_FOLDER @staticmethod From 2714ac6be6881e7a49e59d6aa8c709700720e8e8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 30 Dec 2020 14:34:11 +0100 Subject: [PATCH 040/524] Send CSRF token to Shibboleth --- PFERD/ilias/authenticators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/PFERD/ilias/authenticators.py b/PFERD/ilias/authenticators.py index e70f459..4b99dd8 100644 --- a/PFERD/ilias/authenticators.py +++ b/PFERD/ilias/authenticators.py @@ -74,6 +74,8 @@ class KitShibbolethAuthenticator(IliasAuthenticator): form = soup.find("form", {"class": "full content", "method": "post"}) action = form["action"] + csrf_token = form.find("input", {"name": "csrf_token"})["value"] + # Equivalent: Enter credentials in # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO LOGGER.debug("Attempt to log in to Shibboleth using credentials") @@ -82,6 +84,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator): "_eventId_proceed": "", "j_username": self._auth.username, "j_password": self._auth.password, + "csrf_token": csrf_token } soup = soupify(sess.post(url, data=data)) From c978e9edf462d6aafd71acb29f714c8a677c2fb5 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 30 Dec 2020 14:45:46 +0100 Subject: [PATCH 041/524] Resolve a few pylint warnings --- PFERD/authenticators.py | 2 +- PFERD/ilias/crawler.py | 7 +++++-- PFERD/logging.py | 5 +---- PFERD/organizer.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/PFERD/authenticators.py b/PFERD/authenticators.py index 5537cc1..f85c9d3 100644 --- a/PFERD/authenticators.py +++ b/PFERD/authenticators.py @@ -14,7 +14,7 @@ PRETTY = PrettyLogger(LOGGER) try: import keyring except ImportError: - PRETTY.warning("Keyring module not found, KeyringAuthenticator won't work!") + pass class TfaAuthenticator: diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 4d59dbf..86bf045 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -282,9 +282,12 @@ class IliasCrawler: continue rest_of_name = meeting_name.removeprefix(date_portion_str) - new_name = datetime.datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name + new_name = datetime.datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") \ + + rest_of_name new_path = Path(folder_path, 
_sanitize_path_name(new_name)) - result += [IliasCrawlerEntry(new_path, abs_url, IliasElementType.REGULAR_FOLDER, None)] + result += [ + IliasCrawlerEntry(new_path, abs_url, IliasElementType.REGULAR_FOLDER, None) + ] elif element_type is not None: result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] else: diff --git a/PFERD/logging.py b/PFERD/logging.py index 76741f7..c25019e 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -3,13 +3,10 @@ Contains a few logger utility functions and implementations. """ import logging -from pathlib import Path -from typing import List, Optional +from typing import Optional -from rich import print as rich_print from rich._log_render import LogRender from rich.console import Console -from rich.panel import Panel from rich.style import Style from rich.text import Text from rich.theme import Theme diff --git a/PFERD/organizer.py b/PFERD/organizer.py index a15e751..fe5052b 100644 --- a/PFERD/organizer.py +++ b/PFERD/organizer.py @@ -116,7 +116,7 @@ class Organizer(Location): if self._is_marked(dst): PRETTY.warning(f"File {str(dst_absolute)!r} was already written!") conflict = ConflictType.MARKED_FILE_OVERWRITTEN - if self._resolve_conflict(f"Overwrite file?", dst_absolute, conflict, default=False): + if self._resolve_conflict("Overwrite file?", dst_absolute, conflict, default=False): PRETTY.ignored_file(dst_absolute, "file was written previously") return None From 0e1077bb50618ff144f7aab463448b0fb9f4d770 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 30 Dec 2020 14:50:49 +0100 Subject: [PATCH 042/524] Bump version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 572528a..59aaaa2 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. 
To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.0 +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.1 ``` The use of [venv] is recommended. @@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.0 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.0/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.1 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.1/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index 1ed2876..e57fc75 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.5.0", + version="2.5.1", packages=find_packages(), install_requires=[ "requests>=2.21.0", From f0562049b6e681f60bd4465c0d8610675d6aaaa8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 30 Dec 2020 17:18:04 +0100 Subject: [PATCH 043/524] Remove Python 3.9 method in crawler --- PFERD/ilias/crawler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 86bf045..93b626e 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -281,7 +281,10 @@ class IliasCrawler: result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] continue - rest_of_name = meeting_name.removeprefix(date_portion_str) + rest_of_name = meeting_name + if rest_of_name.startswith(date_portion_str): + rest_of_name = rest_of_name[len(date_portion_str):] + new_name = datetime.datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") \ + rest_of_name new_path = Path(folder_path, _sanitize_path_name(new_name)) From 5de68a0400e478a9912eb6b78e317ad9f5ee8eb1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: 
Wed, 30 Dec 2020 17:20:30 +0100 Subject: [PATCH 044/524] Bump version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 59aaaa2..2b760e0 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.1 +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.2 ``` The use of [venv] is recommended. @@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.1 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.1/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.2 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.2/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index e57fc75..8d672a4 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.5.1", + version="2.5.2", packages=find_packages(), install_requires=[ "requests>=2.21.0", From fb78a6e98e972dfccd7b367810f26d68d743e088 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 6 Jan 2021 12:29:24 +0100 Subject: [PATCH 045/524] Retry ILIAS downloads a few times and only fail that file --- PFERD/ilias/downloader.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/PFERD/ilias/downloader.py b/PFERD/ilias/downloader.py index 82527a0..26e1f2d 100644 --- a/PFERD/ilias/downloader.py +++ b/PFERD/ilias/downloader.py @@ -122,9 +122,22 @@ class IliasDownloader: tmp_file = self._tmp_dir.new_path() - while not self._try_download(info, tmp_file): - LOGGER.info("Retrying download: %r", info) - 
self._authenticator.authenticate(self._session) + download_successful = False + for _ in range(0, 3): + try: + if not self._try_download(info, tmp_file): + LOGGER.info("Re-Authenticating due to download failure: %r", info) + self._authenticator.authenticate(self._session) + else: + download_successful = True + break + except IOError as e: + PRETTY.warning(f"I/O Error when downloading ({e}). Retrying...",) + LOGGER.info("Retrying download for %s", info.path) + + if not download_successful: + PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...") + return dst_path = self._organizer.accept_file(tmp_file, info.path) if dst_path and info.modification_date: From 0b606f02fa1a791eb4f19f5809452624f0c89aaa Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 17 Jan 2021 10:33:10 +0100 Subject: [PATCH 046/524] Bump version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2b760e0..ed92500 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.2 +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.3 ``` The use of [venv] is recommended. 
@@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.2 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.2/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.3 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.3/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index 8d672a4..78f82be 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.5.2", + version="2.5.3", packages=find_packages(), install_requires=[ "requests>=2.21.0", From 35c3fa205ddd44ac8ccbd989155b6a0074323169 Mon Sep 17 00:00:00 2001 From: Toorero Date: Thu, 28 Jan 2021 21:24:09 +0100 Subject: [PATCH 047/524] Fixed description of activating venv (#22) Add 'source' to the venv activate command in the readme `source` was picked over `.` to conform to the python recommendation (https://docs.python.org/3/library/venv.html#module-venv). This patch also adds the `egg-info` you get when building to the gitignore. 
--- .gitignore | 1 + README.md | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index fbb852b..a5f87ba 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ build/ .env .vscode ilias_cookies.txt +PFERD.egg-info/ # PyInstaller sync_url.spec diff --git a/README.md b/README.md index ed92500..44138db 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ A full example setup and initial use could look like: $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv -$ .venv/bin/activate +$ source .venv/bin/activate $ pip install git+https://github.com/Garmelon/PFERD@v2.5.3 $ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.3/example_config.py $ python3 example_config.py @@ -69,7 +69,7 @@ $ deactivate Subsequent runs of the program might look like: ``` $ cd Vorlesungen -$ .venv/bin/activate +$ source .venv/bin/activate $ python3 example_config.py $ deactivate ``` From 83b75e8254d5ed36a629ac35e901772c66066691 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 6 Feb 2021 22:51:08 +0100 Subject: [PATCH 048/524] syncurl: Sanitize element name on windows if it is used as folder name Otherwise the name of the course might not be a valid file name. --- sync_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync_url.py b/sync_url.py index c6db255..ab079c3 100755 --- a/sync_url.py +++ b/sync_url.py @@ -113,7 +113,7 @@ def main() -> None: if not element_name: print("Error, could not get element name."
Please specify a folder yourself.") return - folder = Path(element_name) + folder = sanitize_windows_path(Path(element_name.replace("/", "-").replace("\\", "-"))) cookie_jar.save_cookies() else: folder = Path(args.folder) From 9a9018751ec3dcbd62a4334cca906d82422909ba Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 6 Feb 2021 22:54:05 +0100 Subject: [PATCH 049/524] Bump version --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 44138db..221e8c4 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.3 +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.4 ``` The use of [venv] is recommended. @@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ source .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.3 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.3/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.5.4 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.4/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index 78f82be..70a9107 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.5.3", + version="2.5.4", packages=find_packages(), install_requires=[ "requests>=2.21.0", From 946b7a7931c8dc5c70edbc86e45d5d8e96b638a4 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 9 Feb 2021 12:30:59 +0100 Subject: [PATCH 050/524] Also crawl .c/.java/.zip from IPD page --- PFERD/ipd.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PFERD/ipd.py b/PFERD/ipd.py index d602e0e..ece6a97 
100644 --- a/PFERD/ipd.py +++ b/PFERD/ipd.py @@ -82,7 +82,10 @@ class IpdCrawler: items: List[IpdDownloadInfo] = [] - for link in page.findAll(name="a", attrs={"href": lambda x: x and x.endswith("pdf")}): + def is_relevant_url(x: str) -> bool: + return x.endswith(".pdf") or x.endswith(".c") or x.endswith(".java") or x.endswith(".zip") + + for link in page.findAll(name="a", attrs={"href": lambda x: x and is_relevant_url(x)}): href: str = link.attrs.get("href") name = href.split("/")[-1] From e2bf84392bcbe89ae60d771705af014495343b27 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 8 Apr 2021 18:12:27 +0200 Subject: [PATCH 051/524] [sync_url] Properly declare "no-videos" as flag --- sync_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync_url.py b/sync_url.py index ab079c3..dd88bf7 100755 --- a/sync_url.py +++ b/sync_url.py @@ -74,7 +74,7 @@ def main() -> None: "one line in the following format: ':'") parser.add_argument("-k", "--keyring", action="store_true", help="Use the system keyring service for authentication") - parser.add_argument('--no-videos', nargs='?', default=None, help="Don't download videos") + parser.add_argument('--no-videos', action="store_true", help="Don't download videos") parser.add_argument('--local-first', action="store_true", help="Don't prompt for confirmation, keep existing files") parser.add_argument('--remote-first', action="store_true", From 14cdfb6a690d55a0e83c028ec857c4aab7686d93 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 13 Apr 2021 11:19:51 +0200 Subject: [PATCH 052/524] Fix typo in date demangler doc --- PFERD/ilias/date_demangler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/ilias/date_demangler.py b/PFERD/ilias/date_demangler.py index 9c1fc8d..2950d4d 100644 --- a/PFERD/ilias/date_demangler.py +++ b/PFERD/ilias/date_demangler.py @@ -20,7 +20,7 @@ def demangle_date(date: str) -> Optional[datetime.datetime]: "Gestern, HH:MM" "Heute, HH:MM" "Morgen, HH:MM" - 
"dd. mon.yyyy, HH:MM + "dd. mon yyyy, HH:MM """ saved = locale.setlocale(locale.LC_ALL) try: From 1f2af3a2909e1979d22834652e28662ba4db754b Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 13 Apr 2021 11:32:55 +0200 Subject: [PATCH 053/524] Retry on more I/O Errors --- PFERD/errors.py | 18 ++++++++++++++++++ PFERD/ilias/crawler.py | 3 ++- PFERD/ilias/downloader.py | 24 +++++++++++------------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/PFERD/errors.py b/PFERD/errors.py index d003314..d960e13 100644 --- a/PFERD/errors.py +++ b/PFERD/errors.py @@ -37,3 +37,21 @@ def swallow_and_print_errors(function: TFun) -> TFun: Console().print_exception() return None return cast(TFun, inner) + + +def retry_on_io_exception(max_retries: int, message: str) -> Callable[[TFun], TFun]: + """ + Decorates a function and retries it on any exception until the max retries count is hit. + """ + def retry(function: TFun) -> TFun: + def inner(*args: Any, **kwargs: Any) -> Any: + for i in range(0, max_retries): + # pylint: disable=broad-except + try: + return function(*args, **kwargs) + except IOError as error: + PRETTY.warning(f"Error duing operation '{message}': {error}") + PRETTY.warning( + f"Retrying operation '{message}'. 
Remaining retries: {max_retries - 1 - i}") + return cast(TFun, inner) + return retry diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 93b626e..edab284 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -15,7 +15,7 @@ from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, import bs4 import requests -from ..errors import FatalException +from ..errors import FatalException, retry_on_io_exception from ..logging import PrettyLogger from ..utils import soupify from .authenticators import IliasAuthenticator @@ -625,6 +625,7 @@ class IliasCrawler: return results + @retry_on_io_exception(3, "fetching webpage") def _get_page(self, url: str, params: Dict[str, Any], retry_count: int = 0) -> bs4.BeautifulSoup: """ diff --git a/PFERD/ilias/downloader.py b/PFERD/ilias/downloader.py index 26e1f2d..f6132bf 100644 --- a/PFERD/ilias/downloader.py +++ b/PFERD/ilias/downloader.py @@ -10,6 +10,7 @@ from typing import Callable, List, Optional, Union import bs4 import requests +from ..errors import retry_on_io_exception from ..logging import PrettyLogger from ..organizer import Organizer from ..tmp_dir import TmpDir @@ -116,26 +117,23 @@ class IliasDownloader: """ LOGGER.debug("Downloading %r", info) + if not self._strategy(self._organizer, info): self._organizer.mark(info.path) return tmp_file = self._tmp_dir.new_path() - download_successful = False - for _ in range(0, 3): - try: - if not self._try_download(info, tmp_file): - LOGGER.info("Re-Authenticating due to download failure: %r", info) - self._authenticator.authenticate(self._session) - else: - download_successful = True - break - except IOError as e: - PRETTY.warning(f"I/O Error when downloading ({e}). 
Retrying...",) - LOGGER.info("Retrying download for %s", info.path) + @retry_on_io_exception(3, "downloading file") + def download_impl() -> bool: + if not self._try_download(info, tmp_file): + LOGGER.info("Re-Authenticating due to download failure: %r", info) + self._authenticator.authenticate(self._session) + raise IOError("Scheduled retry") + else: + return True - if not download_successful: + if not download_impl(): PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...") return From 4f480d117e11f9fc2c04e7674efb77b0899619b6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 14 Apr 2021 19:24:05 +0200 Subject: [PATCH 054/524] Install keyring in CI --- .github/workflows/package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index 1c0c353..615917b 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -23,7 +23,7 @@ jobs: python-version: '3.x' - name: "Install dependencies" - run: "pip install setuptools pyinstaller rich requests beautifulsoup4 -f --upgrade" + run: "pip install setuptools keyring pyinstaller rich requests beautifulsoup4 -f --upgrade" - name: "Install sync_url.py" run: "pyinstaller sync_url.py -F" From 80ae5ddfaa87f3d5c7fe54f656c0083b9d818f82 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 14 Apr 2021 19:47:41 +0200 Subject: [PATCH 055/524] Bump version to v2.6.0 --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 221e8c4..b01bbc9 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. 
To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.4 +$ pip install git+https://github.com/Garmelon/PFERD@v2.6.0 ``` The use of [venv] is recommended. @@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ source .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.5.4 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.5.4/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.6.0 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.6.0/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index 70a9107..bdb7754 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.5.4", + version="2.6.0", packages=find_packages(), install_requires=[ "requests>=2.21.0", From 7cc40595dc0fcd4e05a48f5ce8ba7d77f322a284 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 14 Apr 2021 20:25:25 +0200 Subject: [PATCH 056/524] Allow synchronizing to directory "." 
--- sync_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync_url.py b/sync_url.py index dd88bf7..06a94b3 100755 --- a/sync_url.py +++ b/sync_url.py @@ -121,7 +121,7 @@ def main() -> None: # files may not escape the pferd_root with relative paths # note: Path(Path.cwd, Path(folder)) == Path(folder) if it is an absolute path pferd_root = Path(Path.cwd(), Path(folder)).parent - target = folder.name + target = folder.resolve().name pferd = Pferd(pferd_root, test_run=args.test_run) def dir_filter(_: Path, element: IliasElementType) -> bool: From 6d5d9333ad7f8aed4fdce2203134989beb883df9 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 19 Apr 2021 11:07:25 +0200 Subject: [PATCH 057/524] Force folder to be file-system path --- sync_url.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sync_url.py b/sync_url.py index 06a94b3..ca78de0 100755 --- a/sync_url.py +++ b/sync_url.py @@ -121,7 +121,8 @@ def main() -> None: # files may not escape the pferd_root with relative paths # note: Path(Path.cwd, Path(folder)) == Path(folder) if it is an absolute path pferd_root = Path(Path.cwd(), Path(folder)).parent - target = folder.resolve().name + # Folder might be a *PurePath* at this point + target = Path(folder).resolve().name pferd = Pferd(pferd_root, test_run=args.test_run) def dir_filter(_: Path, element: IliasElementType) -> bool: From 29cd5d1a3c4cab259636d6ab1e42f38c7a718792 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 19 Apr 2021 11:10:02 +0200 Subject: [PATCH 058/524] Reflect totality of sanitize_windows_path in return type --- PFERD/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/transform.py b/PFERD/transform.py index 7a05dd1..a2152ba 100644 --- a/PFERD/transform.py +++ b/PFERD/transform.py @@ -130,7 +130,7 @@ def re_rename(regex: Regex, target: str) -> Transform: return inner -def sanitize_windows_path(path: PurePath) -> Optional[PurePath]: +def sanitize_windows_path(path: 
PurePath) -> PurePath: """ A small function to escape characters that are forbidden in windows path names. This method is a no-op on other operating systems. From c1ab7485e248c9bbcfa40405ba9a2dd1713784d1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 19 Apr 2021 11:21:56 +0200 Subject: [PATCH 059/524] Bump version to 2.6.1 --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b01bbc9..178fbac 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Ensure that you have at least Python 3.8 installed. To install PFERD or update your installation to the latest version, run this wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.6.0 +$ pip install git+https://github.com/Garmelon/PFERD@v2.6.1 ``` The use of [venv] is recommended. @@ -60,8 +60,8 @@ $ mkdir Vorlesungen $ cd Vorlesungen $ python3 -m venv .venv $ source .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.6.0 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.6.0/example_config.py +$ pip install git+https://github.com/Garmelon/PFERD@v2.6.1 +$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.6.1/example_config.py $ python3 example_config.py $ deactivate ``` diff --git a/setup.py b/setup.py index bdb7754..a4dfab3 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup setup( name="PFERD", - version="2.6.0", + version="2.6.1", packages=find_packages(), install_requires=[ "requests>=2.21.0", From 27e4abcfa32309eb3dd61ce20a4d65fa22a477a1 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 26 Apr 2021 23:46:44 +0200 Subject: [PATCH 060/524] Do project setup from scratch Following guidelines from the Python Packaging User Guide [1]. This commit intentionally breaks the .gitignore, project dependencies, GitHub Actions and other stuff. It also removes almost the entire README. 
The intention behind this is to get rid of all cruft that has accumulated over time and to have a fresh start. Only necessary things will be re-added as they're needed. From now on, I also plan on adding documentation for every feature at the same time that the feature is implemented. This is to ensure that the documentation does not become outdated. [1]: https://packaging.python.org/ --- .github/workflows/package.yml | 74 --------- .gitignore | 17 +- DEV.md | 37 +++++ README.md | 251 +---------------------------- example_config.py | 131 --------------- example_config_personal_desktop.py | 38 ----- pyproject.toml | 3 + requirements.txt | 4 - setup.cfg | 7 + setup.py | 17 -- 10 files changed, 56 insertions(+), 523 deletions(-) delete mode 100644 .github/workflows/package.yml create mode 100644 DEV.md delete mode 100644 example_config.py delete mode 100644 example_config_personal_desktop.py create mode 100644 pyproject.toml delete mode 100644 requirements.txt create mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml deleted file mode 100644 index 615917b..0000000 --- a/.github/workflows/package.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Package Application with Pyinstaller - -on: - push: - branches: - - "*" - tags: - - "v*" - -jobs: - build: - - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - - steps: - - uses: actions/checkout@v2 - - - uses: actions/setup-python@v2 - with: - python-version: '3.x' - - - name: "Install dependencies" - run: "pip install setuptools keyring pyinstaller rich requests beautifulsoup4 -f --upgrade" - - - name: "Install sync_url.py" - run: "pyinstaller sync_url.py -F" - - - name: "Move artifact" - run: "mv dist/sync_url* dist/sync_url-${{ matrix.os }}" - - - uses: actions/upload-artifact@v2 - with: - name: "Pferd Sync URL" - path: "dist/sync_url*" - - release: - name: Release - - needs: [build] - runs-on: ubuntu-latest
- if: startsWith(github.ref, 'refs/tags/') - - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - steps: - - name: "Checkout" - uses: actions/checkout@v2 - - - name: "Download artifacts" - uses: actions/download-artifact@v2 - with: - name: "Pferd Sync URL" - - - name: "look at folder structure" - run: "ls -lah" - - - name: "Rename releases" - run: "mv sync_url-macos-latest pferd_sync_url_mac && mv sync_url-ubuntu-latest pferd_sync_url_linux && mv sync_url-windows-latest pferd_sync_url.exe" - - - name: "Create release" - uses: softprops/action-gh-release@v1 - - - name: "Upload release artifacts" - uses: softprops/action-gh-release@v1 - with: - body: "Download the correct sync_url for your platform and run it in the terminal or CMD. You might need to make it executable on Linux/Mac with `chmod +x `. Also please enclose the *url you pass to the program in double quotes* or your shell might silently screw it up!" - files: | - pferd_sync_url_mac - pferd_sync_url_linux - pferd_sync_url.exe diff --git a/.gitignore b/.gitignore index a5f87ba..bd8bab9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,2 @@ -__pycache__/ -.venv/ -venv/ -.idea/ -build/ -.mypy_cache/ -.tmp/ -.env -.vscode -ilias_cookies.txt -PFERD.egg-info/ - -# PyInstaller -sync_url.spec -dist/ +/.mypy_cache/ +/.venv/ diff --git a/DEV.md b/DEV.md new file mode 100644 index 0000000..a679b4a --- /dev/null +++ b/DEV.md @@ -0,0 +1,37 @@ +# PFERD Development Guide + +PFERD is packaged following the [Python Packaging User Guide][ppug] (in +particular [this][ppug-1] and [this][ppug-2] guide). + +[ppug]: "Python Packaging User Guide" +[ppug-1]: "Packaging Python Projects" +[ppug-2]: "Packaging and distributing projects" + +## Setting up a dev environment + +The use of [venv][venv] is recommended. To initially set up a development +environment, run these commands in the same directory as this file: + +``` +$ python -m venv .venv +$ . .venv/bin/activate +$ pip install --editable . 
+``` + +After this, you can use PFERD as if it was installed normally. Since PFERD was +installed with `--editable`, there is no need to re-run `pip install` when the +source code is changed. + +For more details, see [this part of the Python Tutorial][venv-tut] and +[this section on "development mode"][ppug-dev]. + +[venv]: "venv - Creation of virtual environments" +[venv-tut]: "12. Virtual Environments and Packages" +[ppug-dev]: "Working in “development mode”" + +## Contributing + +When submitting a PR that adds, changes or modifies a feature, please ensure +that the corresponding documentation is updated. + +In your first PR, please add your name to the `LICENSE` file. diff --git a/README.md b/README.md index 178fbac..5b74de5 100644 --- a/README.md +++ b/README.md @@ -2,254 +2,17 @@ **P**rogramm zum **F**lotten, **E**infachen **R**unterladen von **D**ateien -- [Quickstart with `sync_url`](#quickstart-with-sync_url) -- [Installation](#installation) - - [Upgrading from 2.0.0 to 2.1.0+](#upgrading-from-200-to-210) -- [Example setup](#example-setup) -- [Usage](#usage) - - [General concepts](#general-concepts) - - [Constructing transforms](#constructing-transforms) - - [Transform creators](#transform-creators) - - [Transform combinators](#transform-combinators) - - [A short, but commented example](#a-short-but-commented-example) +Other resources: -## Quickstart with `sync_url` +- [Development Guide](DEV.md) -The `sync_url` program allows you to just synchronize a given ILIAS URL (of a -course, a folder, your personal desktop, etc.) without any extra configuration -or setting up. Download the program, open ILIAS, copy the URL from the address -bar and pass it to sync_url. +## Installation with pip -It bundles everything it needs in one executable and is easy to -use, but doesn't expose all the configuration options and tweaks a full install -does. +Ensure you have at least Python 3.8 installed. 
Run the following command to +install PFERD or upgrade it to the latest version: -1. Download the `sync_url` binary from the [latest release](https://github.com/Garmelon/PFERD/releases/latest). -2. Recognize that you most likely need to enclose the URL in `""` quotes to prevent your shell from interpreting `&` and other symbols -3. Run the binary in your terminal (`./sync_url` or `sync_url.exe` in the CMD) to see the help and use it. I'd recommend using the `--cookies` option. - If you are on **Linux/Mac**, you need to *make the file executable* using `chmod +x `. - If you are on **Mac**, you need to allow this unverified program to run (see e.g. [here](https://www.switchingtomac.com/tutorials/osx/how-to-run-unverified-apps-on-macos/)) - -## Installation - -Ensure that you have at least Python 3.8 installed. - -To install PFERD or update your installation to the latest version, run this -wherever you want to install or have already installed PFERD: ``` -$ pip install git+https://github.com/Garmelon/PFERD@v2.6.1 +$ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest ``` -The use of [venv] is recommended. - -[venv]: https://docs.python.org/3/library/venv.html - -### Upgrading from 2.0.0 to 2.1.0+ - -- The `IliasDirectoryType` type was renamed to `IliasElementType` and is now far more detailed. - The new values are: `REGULAR_FOLDER`, `VIDEO_FOLDER`, `EXERCISE_FOLDER`, `REGULAR_FILE`, `VIDEO_FILE`, `FORUM`, `EXTERNAL_LINK`. -- Forums and external links are skipped automatically if you use the `kit_ilias` helper. - -## Example setup - -In this example, `python3` refers to at least Python 3.8. 
- -A full example setup and initial use could look like: -``` -$ mkdir Vorlesungen -$ cd Vorlesungen -$ python3 -m venv .venv -$ source .venv/bin/activate -$ pip install git+https://github.com/Garmelon/PFERD@v2.6.1 -$ curl -O https://raw.githubusercontent.com/Garmelon/PFERD/v2.6.1/example_config.py -$ python3 example_config.py -$ deactivate -``` - -Subsequent runs of the program might look like: -``` -$ cd Vorlesungen -$ source .venv/bin/activate -$ python3 example_config.py -$ deactivate -``` - -If you just want to get started and crawl *your entire ILIAS Desktop* instead -of a given set of courses, please replace `example_config.py` with -`example_config_personal_desktop.py` in all of the instructions below (`curl` call and -`python3` run command). - -## Usage - -### General concepts - -A PFERD config is a normal python file that starts multiple *synchronizers* -which do all the heavy lifting. While you can create and wire them up manually, -you are encouraged to use the helper methods provided in `PFERD.Pferd`. - -The synchronizers take some input arguments specific to their service and a -*transform*. The transform receives the computed path of an element in ILIAS and -can return either an output path (so you can rename files or move them around as -you wish) or `None` if you do not want to save the given file. - -Additionally the ILIAS synchronizer allows you to define a *crawl filter*. This -filter also receives the computed path as the input, but is only called for -*directories*. If you return `True`, the directory will be crawled and -searched. If you return `False` the directory will be ignored and nothing in it -will be passed to the transform. - -### Constructing transforms - -While transforms are just normal python functions, writing them by hand can -quickly become tedious. 
In order to help you with writing your own transforms -and filters, PFERD defines a few useful transform creators and combinators in -the `PFERD.transform` module: - -#### Transform creators - -These methods let you create a few basic transform building blocks: - -- **`glob(glob)`** - Creates a transform that returns the unchanged path if the glob matches the path and `None` otherwise. - See also [Path.match]. - Example: `glob("Übung/*.pdf")` -- **`predicate(pred)`** - Creates a transform that returns the unchanged path if `pred(path)` returns a truthy value. - Returns `None` otherwise. - Example: `predicate(lambda path: len(path.parts) == 3)` -- **`move_dir(source, target)`** - Creates a transform that moves all files from the `source` to the `target` directory. - Example: `move_dir("Übung/", "Blätter/")` -- **`move(source, target)`** - Creates a transform that moves the `source` file to `target`. - Example: `move("Vorlesung/VL02_Automten.pdf", "Vorlesung/VL02_Automaten.pdf")` -- **`rename(source, target)`** - Creates a transform that renames all files named `source` to `target`. - This transform works on the file names, not paths, and thus works no matter where the file is located. - Example: `rename("VL02_Automten.pdf", "VL02_Automaten.pdf")` -- **`re_move(regex, target)`** - Creates a transform that moves all files matching `regex` to `target`. - The transform `str.format` on the `target` string with the contents of the capturing groups before returning it. - The capturing groups can be accessed via their index. - See also [Match.group]. - Example: `re_move(r"Übung/Blatt (\d+)\.pdf", "Blätter/Blatt_{1:0>2}.pdf")` -- **`re_rename(regex, target)`** - Creates a transform that renames all files matching `regex` to `target`. - This transform works on the file names, not paths, and thus works no matter where the file is located. 
- Example: `re_rename(r"VL(\d+)(.*)\.pdf", "Vorlesung_Nr_{1}__{2}.pdf")` - -All movement or rename transforms above return `None` if a file doesn't match -their movement or renaming criteria. This enables them to be used as building -blocks to build up more complex transforms. - -In addition, `PFERD.transform` also defines the `keep` transform which returns its input path unchanged. -This behaviour can be very useful when creating more complex transforms. -See below for example usage. - -[Path.match]: https://docs.python.org/3/library/pathlib.html#pathlib.Path.match -[Match.group]: https://docs.python.org/3/library/re.html#re.Match.group - -#### Transform combinators - -These methods let you combine transforms into more complex transforms: - -- **`optionally(transform)`** - Wraps a given transform and returns its result if it is not `None`. - Otherwise returns the input path unchanged. - See below for example usage. -* **`do(transforms)`** - Accepts a series of transforms and applies them in the given order to the result of the previous one. - If any transform returns `None`, `do` short-circuits and also returns `None`. - This can be used to perform multiple renames in a row: - ```py - do( - # Move them - move_dir("Vorlesungsmaterial/Vorlesungsvideos/", "Vorlesung/Videos/"), - # Fix extensions (if they have any) - optionally(re_rename("(.*).m4v.mp4", "{1}.mp4")), - # Remove the 'dbs' prefix (if they have any) - optionally(re_rename("(?i)dbs-(.+)", "{1}")), - ) - ``` -- **`attempt(transforms)`** - Applies the passed transforms in the given order until it finds one that does not return `None`. - If it does not find any, it returns `None`. - This can be used to give a list of possible transformations and automatically pick the first one that fits: - ```py - attempt( - # Move all videos. If a video is passed in, this `re_move` will succeed - # and attempt short-circuits with the result. 
- re_move(r"Vorlesungsmaterial/.*/(.+?)\.mp4", "Vorlesung/Videos/{1}.mp4"), - # Move the whole folder to a nicer name - now without any mp4! - move_dir("Vorlesungsmaterial/", "Vorlesung/"), - # If we got another file, keep it. - keep, - ) - ``` - -All of these combinators are used in the provided example configs, if you want -to see some more real-life usages. - -### A short, but commented example - -```py -from pathlib import Path, PurePath -from PFERD import Pferd -from PFERD.ilias import IliasElementType -from PFERD.transform import * - -# This filter will later be used by the ILIAS crawler to decide whether it -# should crawl a directory (or directory-like structure). -def filter_course(path: PurePath, type: IliasElementType) -> bool: - # Note that glob returns a Transform, which is a function from PurePath -> - # Optional[PurePath]. Because of this, we need to apply the result of - # 'glob' to our input path. The returned value will be truthy (a Path) if - # the transform succeeded, or `None` if it failed. - - # We need to crawl the 'Tutorien' folder as it contains one that we want. - if glob("Tutorien/")(path): - return True - # If we found 'Tutorium 10', keep it! - if glob("Tutorien/Tutorium 10")(path): - return True - # Discard all other folders inside 'Tutorien' - if glob("Tutorien/*")(path): - return False - - # All other dirs (including subdirs of 'Tutorium 10') should be searched :) - return True - - -# This transform will later be used to rename a few files. It can also be used -# to ignore some files. -transform_course = attempt( - # We don't care about the other tuts and would instead prefer a cleaner - # directory structure. - move_dir("Tutorien/Tutorium 10/", "Tutorium/"), - # We don't want to modify any other files, so we're going to keep them - # exactly as they are. - keep -) - -# Enable and configure the text output. Needs to be called before calling any -# other PFERD methods. 
-Pferd.enable_logging() -# Create a Pferd instance rooted in the same directory as the script file. This -# is not a test run, so files will be downloaded (default, can be omitted). -pferd = Pferd(Path(__file__).parent, test_run=False) - -# Use the ilias_kit helper to synchronize an ILIAS course -pferd.ilias_kit( - # The directory that all of the downloaded files should be placed in - "My_cool_course/", - # The course ID (found in the URL when on the course page in ILIAS) - "course id", - # A path to a cookie jar. If you synchronize multiple ILIAS courses, - # setting this to a common value requires you to only log in once. - cookies=Path("ilias_cookies.txt"), - # A transform can rename, move or filter out certain files - transform=transform_course, - # A crawl filter limits what paths the cralwer searches - dir_filter=filter_course, -) -``` +The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. diff --git a/example_config.py b/example_config.py deleted file mode 100644 index bffecfb..0000000 --- a/example_config.py +++ /dev/null @@ -1,131 +0,0 @@ -import argparse -from pathlib import Path, PurePath - -from PFERD import Pferd -from PFERD.ilias import IliasElementType -from PFERD.transform import (attempt, do, glob, keep, move, move_dir, - optionally, re_move, re_rename) - -tf_ss_2020_numerik = attempt( - re_move(r"Übungsblätter/(\d+)\. 
Übungsblatt/.*", "Blätter/Blatt_{1:0>2}.pdf"), - keep, -) - - -tf_ss_2020_db = attempt( - move_dir("Begrüßungsvideo/", "Vorlesung/Videos/"), - do( - move_dir("Vorlesungsmaterial/Vorlesungsvideos/", "Vorlesung/Videos/"), - optionally(re_rename("(.*).m4v.mp4", "{1}.mp4")), - optionally(re_rename("(?i)dbs-(.+)", "{1}")), - ), - move_dir("Vorlesungsmaterial/", "Vorlesung/"), - keep, -) - - -tf_ss_2020_rechnernetze = attempt( - re_move(r"Vorlesungsmaterial/.*/(.+?)\.mp4", "Vorlesung/Videos/{1}.mp4"), - move_dir("Vorlesungsmaterial/", "Vorlesung/"), - keep, -) - - -tf_ss_2020_sicherheit = attempt( - move_dir("Vorlesungsvideos/", "Vorlesung/Videos/"), - move_dir("Übungsvideos/", "Übung/Videos/"), - re_move(r"VL(.*)\.pdf", "Vorlesung/{1}.pdf"), - re_move(r"Übungsblatt (\d+)\.pdf", "Blätter/Blatt_{1:0>2}.pdf"), - move("Chiffrat.txt", "Blätter/Blatt_01_Chiffrat.txt"), - keep, -) - - -tf_ss_2020_pg = attempt( - move_dir("Vorlesungsaufzeichnungen/", "Vorlesung/Videos/"), - move_dir("Vorlesungsmaterial/", "Vorlesung/"), - re_move(r"Übungen/uebungsblatt(\d+).pdf", "Blätter/Blatt_{1:0>2}.pdf"), - keep, -) - - -def df_ss_2020_or1(path: PurePath, _type: IliasElementType) -> bool: - if glob("Tutorien/")(path): - return True - if glob("Tutorien/Tutorium 10, dienstags 15:45 Uhr/")(path): - return True - if glob("Tutorien/*")(path): - return False - return True - - -tf_ss_2020_or1 = attempt( - move_dir("Vorlesung/Unbeschriebene Folien/", "Vorlesung/Folien/"), - move_dir("Video zur Organisation/", "Vorlesung/Videos/"), - keep, -) - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--test-run", action="store_true") - parser.add_argument("synchronizers", nargs="*") - args = parser.parse_args() - - pferd = Pferd(Path(__file__).parent, test_run=args.test_run) - pferd.enable_logging() - - if not args.synchronizers or "numerik" in args.synchronizers: - pferd.ilias_kit( - target="Numerik", - course_id="1083036", - transform=tf_ss_2020_numerik, - 
cookies="ilias_cookies.txt", - ) - - if not args.synchronizers or "db" in args.synchronizers: - pferd.ilias_kit( - target="DB", - course_id="1101554", - transform=tf_ss_2020_db, - cookies="ilias_cookies.txt", - ) - - if not args.synchronizers or "rechnernetze" in args.synchronizers: - pferd.ilias_kit( - target="Rechnernetze", - course_id="1099996", - transform=tf_ss_2020_rechnernetze, - cookies="ilias_cookies.txt", - ) - - if not args.synchronizers or "sicherheit" in args.synchronizers: - pferd.ilias_kit( - target="Sicherheit", - course_id="1101980", - transform=tf_ss_2020_sicherheit, - cookies="ilias_cookies.txt", - ) - - if not args.synchronizers or "pg" in args.synchronizers: - pferd.ilias_kit( - target="PG", - course_id="1106095", - transform=tf_ss_2020_pg, - cookies="ilias_cookies.txt", - ) - - if not args.synchronizers or "or1" in args.synchronizers: - pferd.ilias_kit( - target="OR1", - course_id="1105941", - dir_filter=df_ss_2020_or1, - transform=tf_ss_2020_or1, - cookies="ilias_cookies.txt", - ) - - # Prints a summary listing all new, modified or deleted files - pferd.print_summary() - -if __name__ == "__main__": - main() diff --git a/example_config_personal_desktop.py b/example_config_personal_desktop.py deleted file mode 100644 index 8d481b4..0000000 --- a/example_config_personal_desktop.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -This is a small config that just crawls the ILIAS Personal Desktop. -It does not filter or rename anything, it just gobbles up everything it can find. - -Note that this still includes a test-run switch, so you can see what it *would* download. -You can enable that with the "--test-run" command line switch, -i. e. "python3 example_config_minimal.py --test-run". 
-""" - -import argparse -from pathlib import Path - -from PFERD import Pferd - - -def main() -> None: - # Parse command line arguments - parser = argparse.ArgumentParser() - parser.add_argument("--test-run", action="store_true") - args = parser.parse_args() - - # Create the Pferd helper instance - pferd = Pferd(Path(__file__).parent, test_run=args.test_run) - pferd.enable_logging() - - # Synchronize the personal desktop into the "ILIAS" directory. - # It saves the cookies, so you only need to log in again when the ILIAS cookies expire. - pferd.ilias_kit_personal_desktop( - "ILIAS", - cookies="ilias_cookies.txt", - ) - - # Prints a summary listing all new, modified or deleted files - pferd.print_summary() - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9787c3b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2d852e1..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -requests>=2.21.0 -beautifulsoup4>=4.7.1 -rich>=2.1.0 -keyring>=21.5.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..6d01c03 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[metadata] +name = PFERD +version = 3.0.0 + +[options] +packages = PFERD +python_requires = >=3.8 diff --git a/setup.py b/setup.py deleted file mode 100644 index a4dfab3..0000000 --- a/setup.py +++ /dev/null @@ -1,17 +0,0 @@ -from setuptools import find_packages, setup - -setup( - name="PFERD", - version="2.6.1", - packages=find_packages(), - install_requires=[ - "requests>=2.21.0", - "beautifulsoup4>=4.7.1", - "rich>=2.1.0", - "keyring>=21.5.0" - ], -) - -# When updating the version, also: -# - update the README.md installation instructions -# - set a tag on the update commit From 5595a908d883f1f3da5a14790017085002e5d3e4 Mon 
Sep 17 00:00:00 2001 From: Joscha Date: Tue, 27 Apr 2021 00:29:42 +0200 Subject: [PATCH 061/524] Configure entry point --- .gitignore | 2 ++ PFERD/__init__.py | 10 ++-------- setup.cfg | 4 ++++ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index bd8bab9..9ee2832 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /.mypy_cache/ /.venv/ +/PFERD.egg-info/ +__pycache__/ diff --git a/PFERD/__init__.py b/PFERD/__init__.py index 059f585..b657171 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -1,8 +1,2 @@ -# pylint: disable=invalid-name - -""" -This module exports only what you need for a basic configuration. If you want a -more complex configuration, you need to import the other submodules manually. -""" - -from .pferd import Pferd +def main() -> None: + print("Hello world") diff --git a/setup.cfg b/setup.cfg index 6d01c03..db60477 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,3 +5,7 @@ version = 3.0.0 [options] packages = PFERD python_requires = >=3.8 + +[options.entry_points] +console_scripts = + pferd = PFERD:main From fbebc46c580c02562f270d8bae23f28d2d8e540a Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 27 Apr 2021 12:41:49 +0200 Subject: [PATCH 062/524] Load and dump config --- .gitignore | 2 +- PFERD/__init__.py | 40 +++++++++++++++++- PFERD/config.py | 101 ++++++++++++++++++++++++++++++++++++++++++++++ PFERD/utils.py | 97 ++++++-------------------------------------- 4 files changed, 153 insertions(+), 87 deletions(-) create mode 100644 PFERD/config.py diff --git a/.gitignore b/.gitignore index 9ee2832..c888722 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -/.mypy_cache/ +.mypy_cache/ /.venv/ /PFERD.egg-info/ __pycache__/ diff --git a/PFERD/__init__.py b/PFERD/__init__.py index b657171..7b3a3c1 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -1,2 +1,40 @@ +import argparse +from pathlib import Path + +from .config import Config, ConfigDumpException, ConfigLoadException + + def main() -> 
None: - print("Hello world") + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", "-c", + type=Path, + metavar="PATH", + help="specify custom config file path", + ) + parser.add_argument( + "--dump-config", + nargs="?", + const=True, + type=Path, + metavar="PATH", + help="dump current configuration to a file and exit." + " Uses default config file path if no path is specified", + ) + args = parser.parse_args() + + try: + config_parser = Config.load_parser(args.config) + config = Config(config_parser) + except ConfigLoadException: + exit(1) + + if args.dump_config: + path = None if args.dump_config is True else args.dump_config + try: + config.dump(path) + except ConfigDumpException: + exit(1) + exit() + + print(config) diff --git a/PFERD/config.py b/PFERD/config.py new file mode 100644 index 0000000..d71e4d1 --- /dev/null +++ b/PFERD/config.py @@ -0,0 +1,101 @@ +import configparser +import os +from pathlib import Path +from typing import Optional + +from .utils import prompt_yes_no + + +class ConfigLoadException(Exception): + pass + + +class ConfigDumpException(Exception): + pass + + +class Config: + @staticmethod + def _default_path() -> Path: + if os.name == "posix": + return Path("~/.config/PFERD/pferd.cfg").expanduser() + elif os.name == "nt": + return Path("~/AppData/Roaming/PFERD/pferd.cfg").expanduser() + else: + return Path("~/.pferd.cfg").expanduser() + + def __init__(self, parser: configparser.ConfigParser): + self._parser = parser + # TODO Load and validate config into dataclasses + + @staticmethod + def _fail_load(path: Path, reason: str) -> None: + print(f"Failed to load config file at {path}") + print(f"Reason: {reason}") + raise ConfigLoadException() + + @staticmethod + def load_parser(path: Optional[Path] = None) -> configparser.ConfigParser: + """ + May throw a ConfigLoadException. 
+ """ + + if not path: + path = Config._default_path() + + parser = configparser.ConfigParser() + + # Using config.read_file instead of config.read because config.read + # would just ignore a missing file and carry on. + try: + with open(path) as f: + parser.read_file(f, source=str(path)) + except FileNotFoundError: + Config._fail_load(path, "File does not exist") + except IsADirectoryError: + Config._fail_load(path, "That's a directory, not a file") + except PermissionError: + Config._fail_load(path, "Insufficient permissions") + + return parser + + @staticmethod + def _fail_dump(path: Path, reason: str) -> None: + print(f"Failed to dump config file to {path}") + print(f"Reason: {reason}") + raise ConfigDumpException() + + def dump(self, path: Optional[Path] = None) -> None: + """ + May throw a ConfigDumpException. + """ + + if not path: + path = self._default_path() + + print(f"Dumping config to {path}") + + try: + path.parent.mkdir(parents=True, exist_ok=True) + except PermissionError: + self._fail_dump(path, "Could not create parent directory") + + try: + # Ensuring we don't accidentally overwrite any existing files by + # always asking before overwriting a file. + try: + # x = open for exclusive creation, failing if the file already + # exists + with open(path, "x") as f: + self._parser.write(f) + except FileExistsError: + print("That file already exists.") + if prompt_yes_no("Overwrite it?", default=False): + with open(path, "w") as f: + self._parser.write(f) + else: + self._fail_dump(path, "File already exists") + except IsADirectoryError: + self._fail_dump(path, "That's a directory, not a file") + except PermissionError: + self._fail_dump(path, "Insufficient permissions") diff --git a/PFERD/utils.py b/PFERD/utils.py index 56c101a..4e1b5d7 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -1,98 +1,25 @@ -""" -A few utility bobs and bits. 
-""" - -import re -from pathlib import Path, PurePath -from typing import Optional, Tuple, Union - -import bs4 -import requests - -from .progress import ProgressSettings, progress_for, size_from_headers - -PathLike = Union[PurePath, str, Tuple[str, ...]] +from typing import Optional -def to_path(pathlike: PathLike) -> Path: +def prompt_yes_no(query: str, default: Optional[bool]) -> bool: """ - Convert a given PathLike into a Path. - """ - if isinstance(pathlike, tuple): - return Path(*pathlike) - return Path(pathlike) - - -Regex = Union[str, re.Pattern] - - -def to_pattern(regex: Regex) -> re.Pattern: - """ - Convert a regex to a re.Pattern. - """ - if isinstance(regex, re.Pattern): - return regex - return re.compile(regex) - - -def soupify(response: requests.Response) -> bs4.BeautifulSoup: - """ - Wrap a requests response in a bs4 object. - """ - - return bs4.BeautifulSoup(response.text, "html.parser") - - -def stream_to_path( - response: requests.Response, - target: Path, - progress_name: Optional[str] = None, - chunk_size: int = 1024 ** 2 -) -> None: - """ - Download a requests response content to a file by streaming it. This - function avoids excessive memory usage when downloading large files. The - chunk_size is in bytes. - - If progress_name is None, no progress bar will be shown. Otherwise a progress - bar will appear, if the download is bigger than an internal threshold. 
- """ - - with response: - length = size_from_headers(response) - if progress_name and length and int(length) > 1024 * 1024 * 10: # 10 MiB - settings: Optional[ProgressSettings] = ProgressSettings(progress_name, length) - else: - settings = None - - with open(target, 'wb') as file_descriptor: - with progress_for(settings) as progress: - for chunk in response.iter_content(chunk_size=chunk_size): - file_descriptor.write(chunk) - progress.advance(len(chunk)) - - -def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool: - """ - Prompts the user a yes/no question and returns their choice. + Asks the user a yes/no question and returns their choice. """ if default is True: - prompt = "[Y/n]" + query += " [Y/n] " elif default is False: - prompt = "[y/N]" + query += " [y/N] " else: - prompt = "[y/n]" - - text = f"{question} {prompt} " - wrong_reply = "Please reply with 'yes'/'y' or 'no'/'n'." + query += " [y/n] " while True: - response = input(text).strip().lower() - if response in {"yes", "ye", "y"}: + response = input(query).strip().lower() + if response == "y": return True - if response in {"no", "n"}: + elif response == "n": return False - if response == "" and default is not None: + elif response == "" and default is not None: return default - print(wrong_reply) + + print("Please answer with 'y' or 'n'.") From 3a74c23d0991d1ae340f71ec5e3d1ad9fb359916 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 09:51:25 +0200 Subject: [PATCH 063/524] Implement transformer --- PFERD/transformer.py | 238 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 PFERD/transformer.py diff --git a/PFERD/transformer.py b/PFERD/transformer.py new file mode 100644 index 0000000..1ecaf19 --- /dev/null +++ b/PFERD/transformer.py @@ -0,0 +1,238 @@ +import re +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Optional, Union + + +class Rule(ABC): + 
@abstractmethod + def transform(self, path: Path) -> Optional[Path]: + pass + + +class NormalRule(Rule): + def __init__(self, left: Path, right: Path): + self._left = left + self._right = right + + def _match_prefix(self, path: Path) -> Optional[Path]: + left_parts = list(reversed(self._left.parts)) + path_parts = list(reversed(path.parts)) + + if len(left_parts) > len(path_parts): + return None + + while left_parts and path_parts: + left_part = left_parts.pop() + path_part = path_parts.pop() + + if left_part != path_part: + return None + + if left_parts: + return None + + return Path(*path_parts) + + def transform(self, path: Path) -> Optional[Path]: + if rest := self._match_prefix(path): + return self._right / rest + + return None + + +class ExactRule(Rule): + def __init__(self, left: Path, right: Path): + self._left = left + self._right = right + + def transform(self, path: Path) -> Optional[Path]: + if path == self._left: + return self._right + + return None + + +class ReRule(Rule): + def __init__(self, left: str, right: str): + self._left = left + self._right = right + + def transform(self, path: Path) -> Optional[Path]: + if match := re.fullmatch(self._left, str(path)): + kwargs: Dict[str, Union[int, float]] = {} + + groups = [match[0]] + list(match.groups()) + for i, group in enumerate(groups): + try: + kwargs[f"i{i}"] = int(group) + except ValueError: + pass + + try: + kwargs[f"f{i}"] = float(group) + except ValueError: + pass + + return Path(self._right.format(*groups, **kwargs)) + + return None + + +@dataclass +class RuleParseException(Exception): + line: "Line" + reason: str + + def pretty_print(self) -> None: + print(f"Error parsing rule on line {self.line.line_nr}:") + print(self.line.line) + spaces = " " * self.line.index + print(f"{spaces}^--- {self.reason}") + + +class Line: + def __init__(self, line: str, line_nr: int): + self._line = line + self._line_nr = line_nr + self._index = 0 + + def get(self) -> Optional[str]: + if self._index < 
len(self._line): + return self._line[self._index] + + return None + + @property + def line(self) -> str: + return self._line + + @property + def line_nr(self) -> str: + return self._line + + @property + def index(self) -> int: + return self._index + + @index.setter + def index(self, index: int) -> None: + self._index = index + + def advance(self) -> None: + self._index += 1 + + def expect(self, string: str) -> None: + for char in string: + if self.get() == char: + self.advance() + else: + raise RuleParseException(self, f"Expected {char!r}") + + +QUOTATION_MARKS = {'"', "'"} + + +def parse_string_literal(line: Line) -> str: + escaped = False + result = [] + + quotation_mark = line.get() + if quotation_mark not in QUOTATION_MARKS: + # This should never happen as long as this function is only called from + # parse_string. + raise RuleParseException(line, "Invalid quotation mark") + line.advance() + + while c := line.get(): + if escaped: + result.append(c) + escaped = False + line.advance() + elif c == quotation_mark: + line.advance() + return "".join(result) + elif c == "\\": + escaped = True + line.advance() + else: + result.append(c) + line.advance() + + raise RuleParseException(line, "Expected end of string literal") + + +def parse_until_space_or_eol(line: Line) -> str: + result = [] + while c := line.get(): + if c == " ": + break + result.append(c) + line.advance() + + return "".join(result) + + +def parse_string(line: Line) -> str: + if line.get() in QUOTATION_MARKS: + return parse_string_literal(line) + else: + return parse_until_space_or_eol(line) + + +def parse_arrow(line: Line) -> str: + line.expect("-") + + name = [] + while True: + if c := line.get(): + if c == "-": + break + else: + name.append(c) + line.advance() + else: + raise RuleParseException(line, "Expected rest of arrow") + + line.expect("->") + return "".join(name) + + +def parse_rule(line: Line) -> Rule: + left = parse_string(line) + line.expect(" ") + arrowindex = line.index + arrowname = 
parse_arrow(line) + line.expect(" ") + right = parse_string(line) + + if arrowname == "": + return NormalRule(Path(left), Path(right)) + elif arrowname == "exact": + return ExactRule(Path(left), Path(right)) + elif arrowname == "re": + return ReRule(left, right) + else: + line.index = arrowindex + 1 # For nicer error message + raise RuleParseException(line, "Invalid arrow name") + + +class Transformer: + def __init__(self, rules: str): + """ + May throw a RuleParseException. + """ + + self._rules = [] + for i, line in enumerate(rules.split("\n")): + line = line.strip() + if line: + self._rules.append(parse_rule(Line(line, i))) + + def transform(self, path: Path) -> Optional[Path]: + for rule in self._rules: + if result := rule.transform(path): + return result + + return None From b915e393dd28a3fe317eb552f211e9c0c0738a35 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 10:24:28 +0200 Subject: [PATCH 064/524] Implement limiter --- PFERD/limiter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 PFERD/limiter.py diff --git a/PFERD/limiter.py b/PFERD/limiter.py new file mode 100644 index 0000000..f73e2cd --- /dev/null +++ b/PFERD/limiter.py @@ -0,0 +1,19 @@ +import asyncio +from contextlib import AbstractAsyncContextManager, asynccontextmanager +from typing import AsyncIterator + + +class Limiter: + def __init__(self, limit: int = 10): + self._semaphore = asyncio.Semaphore(limit) + + @asynccontextmanager + async def _context_manager(self) -> AsyncIterator[None]: + await self._semaphore.acquire() + try: + yield + finally: + self._semaphore.release() + + def limit(self) -> AbstractAsyncContextManager[None]: + return self._context_manager() From a18db57e6fc0c92cb4dec2734bd393e05d6913d6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 11:25:00 +0200 Subject: [PATCH 065/524] Implement terminal conductor --- PFERD/conductor.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++ setup.cfg | 2 ++ 2 files changed, 85 
insertions(+) create mode 100644 PFERD/conductor.py diff --git a/PFERD/conductor.py b/PFERD/conductor.py new file mode 100644 index 0000000..bf41f61 --- /dev/null +++ b/PFERD/conductor.py @@ -0,0 +1,83 @@ +import asyncio +from contextlib import (AbstractAsyncContextManager, AbstractContextManager, + asynccontextmanager, contextmanager) +from pathlib import Path +from typing import AsyncIterator, Iterator, List, Optional + +import rich +from rich.markup import escape +from rich.progress import Progress, TaskID + + +class ProgressBar: + def __init__(self, progress: Progress, taskid: TaskID): + self._progress = progress + self._taskid = taskid + + def advance(self, amount: float = 1) -> None: + self._progress.advance(self._taskid, advance=amount) + + +class TerminalConductor: + def __init__(self) -> None: + self._stopped = False + self._lock = asyncio.Lock() + self._progress = Progress() + self._lines: List[str] = [] + + def _start(self) -> None: + for line in self._lines: + rich.print(line) + self._lines = [] + + self._progress.start() + + def _stop(self) -> None: + self._progress.stop() + self._stopped = True + + async def start(self) -> None: + with self._lock: + self._start() + + async def stop(self) -> None: + with self._lock: + self._stop() + + def print(self, line: str) -> None: + if self._stopped: + self._lines.append(line) + else: + rich.print(line) + + @asynccontextmanager + async def _exclusive_output_cm(self) -> AsyncIterator[None]: + async with self._lock: + self.stop() + try: + yield + finally: + self.start() + + def exclusive_output(self) -> AbstractAsyncContextManager[None]: + return self._exclusive_output_cm() + + @contextmanager + def _progress_bar_cm( + self, + description: str, + steps: Optional[float], + ) -> Iterator[ProgressBar]: + taskid = self._progress.add_task(description, steps=steps) + bar = ProgressBar(self._progress, taskid) + try: + yield bar + finally: + self._progress.remove_task(taskid) + + def progress_bar( + self, + description: 
Path, + steps: Optional[float], + ) -> AbstractContextManager[ProgressBar]: + return self._progress_bar_cm(escape(str(description)), steps=steps) diff --git a/setup.cfg b/setup.cfg index db60477..1c6e764 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,6 +5,8 @@ version = 3.0.0 [options] packages = PFERD python_requires = >=3.8 +install_requires = + rich>=10.1.0 [options.entry_points] console_scripts = From 8da1ac6ceee7b37c0654193519d4ae780f4e9d72 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 11:25:13 +0200 Subject: [PATCH 066/524] Extend mypy config --- mypy.ini | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mypy.ini b/mypy.ini index 60306de..14509d6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,7 +1,11 @@ [mypy] +disallow_any_generics = True disallow_untyped_defs = True disallow_incomplete_defs = True no_implicit_optional = True +warn_unused_ignores = True +warn_unreachable = True +show_error_context = True [mypy-rich.*,bs4,keyring] ignore_missing_imports = True From c4fb92c6585f90b6a99a884c8a4859ce46d7f884 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 13:11:58 +0200 Subject: [PATCH 067/524] Make type hints compatible with Python 3.8 --- PFERD/conductor.py | 11 ++++++----- PFERD/limiter.py | 7 ++++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/PFERD/conductor.py b/PFERD/conductor.py index bf41f61..86df7e4 100644 --- a/PFERD/conductor.py +++ b/PFERD/conductor.py @@ -1,8 +1,9 @@ import asyncio -from contextlib import (AbstractAsyncContextManager, AbstractContextManager, - asynccontextmanager, contextmanager) +from contextlib import asynccontextmanager, contextmanager from pathlib import Path -from typing import AsyncIterator, Iterator, List, Optional +# TODO If we upgrade to python 3.9, these context manager hints are deprecated +from typing import (AsyncContextManager, AsyncIterator, ContextManager, + Iterator, List, Optional) import rich from rich.markup import escape @@ -59,7 +60,7 @@ class TerminalConductor: 
finally: self.start() - def exclusive_output(self) -> AbstractAsyncContextManager[None]: + def exclusive_output(self) -> AsyncContextManager[None]: return self._exclusive_output_cm() @contextmanager @@ -79,5 +80,5 @@ class TerminalConductor: self, description: Path, steps: Optional[float], - ) -> AbstractContextManager[ProgressBar]: + ) -> ContextManager[ProgressBar]: return self._progress_bar_cm(escape(str(description)), steps=steps) diff --git a/PFERD/limiter.py b/PFERD/limiter.py index f73e2cd..f001d8b 100644 --- a/PFERD/limiter.py +++ b/PFERD/limiter.py @@ -1,6 +1,7 @@ import asyncio -from contextlib import AbstractAsyncContextManager, asynccontextmanager -from typing import AsyncIterator +from contextlib import asynccontextmanager +# TODO If we upgrade to python 3.9, this context manager hint is deprecated +from typing import AsyncContextManager, AsyncIterator class Limiter: @@ -15,5 +16,5 @@ class Limiter: finally: self._semaphore.release() - def limit(self) -> AbstractAsyncContextManager[None]: + def limit(self) -> AsyncContextManager[None]: return self._context_manager() From 7e127cd5cce37b6c0f6cd8b0139ae9d13cb69e07 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 13:43:50 +0200 Subject: [PATCH 068/524] Clean up and fix conductor and limiter Turns out you have to await an async lock, who knew... 
--- PFERD/conductor.py | 33 ++++++++++++--------------------- PFERD/limiter.py | 8 ++------ 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/PFERD/conductor.py b/PFERD/conductor.py index 86df7e4..fef5a0e 100644 --- a/PFERD/conductor.py +++ b/PFERD/conductor.py @@ -1,12 +1,8 @@ import asyncio from contextlib import asynccontextmanager, contextmanager -from pathlib import Path -# TODO If we upgrade to python 3.9, these context manager hints are deprecated -from typing import (AsyncContextManager, AsyncIterator, ContextManager, - Iterator, List, Optional) +from typing import AsyncIterator, Iterator, List, Optional import rich -from rich.markup import escape from rich.progress import Progress, TaskID @@ -38,11 +34,11 @@ class TerminalConductor: self._stopped = True async def start(self) -> None: - with self._lock: + async with self._lock: self._start() async def stop(self) -> None: - with self._lock: + async with self._lock: self._stop() def print(self, line: str) -> None: @@ -52,7 +48,7 @@ class TerminalConductor: rich.print(line) @asynccontextmanager - async def _exclusive_output_cm(self) -> AsyncIterator[None]: + async def exclusive_output(self) -> AsyncIterator[None]: async with self._lock: self.stop() try: @@ -60,25 +56,20 @@ class TerminalConductor: finally: self.start() - def exclusive_output(self) -> AsyncContextManager[None]: - return self._exclusive_output_cm() - @contextmanager - def _progress_bar_cm( + def progress_bar( self, description: str, - steps: Optional[float], + total: Optional[float] = None, ) -> Iterator[ProgressBar]: - taskid = self._progress.add_task(description, steps=steps) + if total is None: + # Indeterminate progress bar + taskid = self._progress.add_task(description, start=False) + else: + taskid = self._progress.add_task(description, total=total) + bar = ProgressBar(self._progress, taskid) try: yield bar finally: self._progress.remove_task(taskid) - - def progress_bar( - self, - description: Path, - steps: Optional[float], 
- ) -> ContextManager[ProgressBar]: - return self._progress_bar_cm(escape(str(description)), steps=steps) diff --git a/PFERD/limiter.py b/PFERD/limiter.py index f001d8b..ff91d57 100644 --- a/PFERD/limiter.py +++ b/PFERD/limiter.py @@ -1,7 +1,6 @@ import asyncio from contextlib import asynccontextmanager -# TODO If we upgrade to python 3.9, this context manager hint is deprecated -from typing import AsyncContextManager, AsyncIterator +from typing import AsyncIterator class Limiter: @@ -9,12 +8,9 @@ class Limiter: self._semaphore = asyncio.Semaphore(limit) @asynccontextmanager - async def _context_manager(self) -> AsyncIterator[None]: + async def limit(self) -> AsyncIterator[None]: await self._semaphore.acquire() try: yield finally: self._semaphore.release() - - def limit(self) -> AsyncContextManager[None]: - return self._context_manager() From bbc792f9fb7de4459da1fdaa55f24ea292333981 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 13:44:29 +0200 Subject: [PATCH 069/524] Implement Crawler and DummyCrawler --- PFERD/crawler.py | 60 ++++++++++++++++++++++++++++++++++++++ PFERD/crawlers/__init__.py | 5 ++++ PFERD/crawlers/dummy.py | 53 +++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 PFERD/crawler.py create mode 100644 PFERD/crawlers/__init__.py create mode 100644 PFERD/crawlers/dummy.py diff --git a/PFERD/crawler.py b/PFERD/crawler.py new file mode 100644 index 0000000..9f1c7d9 --- /dev/null +++ b/PFERD/crawler.py @@ -0,0 +1,60 @@ +import configparser +from abc import ABC, abstractmethod +from contextlib import asynccontextmanager +from pathlib import Path +from typing import AsyncIterator, Optional + +from rich.markup import escape + +from .conductor import ProgressBar, TerminalConductor +from .limiter import Limiter +from .transformer import RuleParseException, Transformer + + +class CrawlerLoadException(Exception): + pass + + +class Crawler(ABC): + def __init__(self, name: str, section: configparser.SectionProxy) 
-> None: + """ + May throw a CrawlerLoadException. + """ + + self.name = name + + self._conductor = TerminalConductor() + self._limiter = Limiter() + + try: + self._transformer = Transformer(section.get("transform", "")) + except RuleParseException as e: + e.pretty_print() + raise CrawlerLoadException() + + # output_dir = Path(section.get("output_dir", name)) + + def print(self, text: str) -> None: + self._conductor.print(text) + + @asynccontextmanager + async def progress_bar( + self, + path: Path, + total: Optional[int] = None, + ) -> AsyncIterator[ProgressBar]: + desc = escape(str(path)) + async with self._limiter.limit(): + with self._conductor.progress_bar(desc, total=total) as bar: + yield bar + + async def run(self) -> None: + await self._conductor.start() + try: + await self.crawl() + finally: + await self._conductor.stop() + + @abstractmethod + async def crawl(self) -> None: + pass diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py new file mode 100644 index 0000000..5248a2d --- /dev/null +++ b/PFERD/crawlers/__init__.py @@ -0,0 +1,5 @@ +from .dummy import DummyCrawler + +CRAWLERS = { + "dummy": DummyCrawler, +} diff --git a/PFERD/crawlers/dummy.py b/PFERD/crawlers/dummy.py new file mode 100644 index 0000000..b4d787a --- /dev/null +++ b/PFERD/crawlers/dummy.py @@ -0,0 +1,53 @@ +import asyncio +import random +from pathlib import Path +from typing import Any + +from rich.markup import escape + +from ..crawler import Crawler + +DUMMY_TREE = { + "Blätter": { + "Blatt_01.pdf": (), + "Blatt_02.pdf": (), + "Blatt_03.pdf": (), + "Blatt_04.pdf": (), + "Blatt_05.pdf": (), + "Blatt_01_Lösung.pdf": (), + "Blatt_02_Lösung.pdf": (), + "Blatt_03_Lösung.pdf": (), + "Blatt_04_Lösung.pdf": (), + "Blatt_05_Lösung.pdf": (), + }, + "Vorlesungsfolien": { + "VL_01.pdf": (), + "VL_02.pdf": (), + "VL_03.pdf": (), + "VL_04.pdf": (), + "VL_05.pdf": (), + }, + "noch_mehr.txt": (), + "dateien.jar": (), +} + + +class DummyCrawler(Crawler): + async def crawl(self) -> 
None: + await self._crawl_entry(Path(), DUMMY_TREE) + + async def _crawl_entry(self, path: Path, value: Any) -> None: + if value == (): + n = random.randint(5, 20) + async with self.progress_bar(path, n) as bar: + await asyncio.sleep(random.random() / 2) + for i in range(n): + await asyncio.sleep(0.5) + bar.advance() + self.print(f"[green]Downloaded {escape(str(path))}") + else: + t = random.random() * 2 + 1 + async with self.progress_bar(path) as bar: + await asyncio.sleep(t) + tasks = [self._crawl_entry(path / k, v) for k, v in value.items()] + await asyncio.gather(*tasks) From 3ea86d18a0be532e53f62c54425c9bc9814b0ead Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 13:45:04 +0200 Subject: [PATCH 070/524] Jerry-rig DummyCrawler to run --- PFERD/__init__.py | 5 +- PFERD/pferd.py | 446 +--------------------------------------------- 2 files changed, 13 insertions(+), 438 deletions(-) diff --git a/PFERD/__init__.py b/PFERD/__init__.py index 7b3a3c1..a16b19b 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -1,7 +1,9 @@ import argparse +import asyncio from pathlib import Path from .config import Config, ConfigDumpException, ConfigLoadException +from .pferd import Pferd def main() -> None: @@ -37,4 +39,5 @@ def main() -> None: exit(1) exit() - print(config) + pferd = Pferd(config) + asyncio.run(pferd.run()) diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 1bb6f78..d145ade 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -1,440 +1,12 @@ -""" -Convenience functions for using PFERD. 
-""" +from .config import Config +from .crawlers import CRAWLERS -import logging -from pathlib import Path -from typing import Callable, List, Optional, Union -from .authenticators import UserPassAuthenticator -from .cookie_jar import CookieJar -from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler, - diva_download_new) -from .download_summary import DownloadSummary -from .errors import FatalException, swallow_and_print_errors -from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter, - IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy, - KitShibbolethAuthenticator, download_modified_or_new) -from .ipd import (IpdCrawler, IpdDownloader, IpdDownloadInfo, - IpdDownloadStrategy, ipd_download_new_or_modified) -from .location import Location -from .logging import PrettyLogger, enable_logging -from .organizer import FileConflictResolver, Organizer, resolve_prompt_user -from .tmp_dir import TmpDir -from .transform import TF, Transform, apply_transform -from .utils import PathLike, to_path +class Pferd: + def __init__(self, config: Config): + self._config = config -# TODO save known-good cookies as soon as possible - - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - - -class Pferd(Location): - # pylint: disable=too-many-arguments - """ - The main entrypoint in your Pferd usage: This class combines a number of - useful shortcuts for running synchronizers in a single interface. - """ - - def __init__( - self, - base_dir: Path, - tmp_dir: Path = Path(".tmp"), - test_run: bool = False - ): - super().__init__(Path(base_dir)) - - self._download_summary = DownloadSummary() - self._tmp_dir = TmpDir(self.resolve(tmp_dir)) - self._test_run = test_run - - @staticmethod - def enable_logging() -> None: - """ - Enable and configure logging via the logging module. 
- """ - - enable_logging() - - @staticmethod - def _print_transformables(transformables: List[TF]) -> None: - LOGGER.info("") - LOGGER.info("Results of the test run:") - for transformable in transformables: - LOGGER.info(transformable.path) - - @staticmethod - def _get_authenticator( - username: Optional[str], password: Optional[str] - ) -> KitShibbolethAuthenticator: - inner_auth = UserPassAuthenticator("ILIAS - Pferd.py", username, password) - return KitShibbolethAuthenticator(inner_auth) - - def _ilias( - self, - target: PathLike, - base_url: str, - crawl_function: Callable[[IliasCrawler], List[IliasDownloadInfo]], - authenticator: IliasAuthenticator, - cookies: Optional[PathLike], - dir_filter: IliasDirectoryFilter, - transform: Transform, - download_strategy: IliasDownloadStrategy, - timeout: int, - clean: bool = True, - file_conflict_resolver: FileConflictResolver = resolve_prompt_user - ) -> Organizer: - # pylint: disable=too-many-locals - cookie_jar = CookieJar(to_path(cookies) if cookies else None) - session = cookie_jar.create_session() - tmp_dir = self._tmp_dir.new_subdir() - organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver) - - crawler = IliasCrawler(base_url, session, authenticator, dir_filter) - downloader = IliasDownloader(tmp_dir, organizer, session, - authenticator, download_strategy, timeout) - - cookie_jar.load_cookies() - info = crawl_function(crawler) - cookie_jar.save_cookies() - - transformed = apply_transform(transform, info) - if self._test_run: - self._print_transformables(transformed) - return organizer - - downloader.download_all(transformed) - cookie_jar.save_cookies() - - if clean: - organizer.cleanup() - - return organizer - - @swallow_and_print_errors - def ilias_kit( - self, - target: PathLike, - course_id: str, - dir_filter: IliasDirectoryFilter = lambda x, y: True, - transform: Transform = lambda x: x, - cookies: Optional[PathLike] = None, - username: Optional[str] = None, - password: Optional[str] = 
None, - download_strategy: IliasDownloadStrategy = download_modified_or_new, - clean: bool = True, - timeout: int = 5, - file_conflict_resolver: FileConflictResolver = resolve_prompt_user - ) -> Organizer: - """ - Synchronizes a folder with the ILIAS instance of the KIT. - - Arguments: - target {Path} -- the target path to write the data to - course_id {str} -- the id of the main course page (found in the URL after ref_id - when opening the course homepage) - - Keyword Arguments: - dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the - crawler level, these directories and all of their content is skipped. - (default: {lambdax:True}) - transform {Transform} -- A transformation function for the output paths. Return None - to ignore a file. (default: {lambdax:x}) - cookies {Optional[Path]} -- The path to store and load cookies from. - (default: {None}) - username {Optional[str]} -- The SCC username. If none is given, it will prompt - the user. (default: {None}) - password {Optional[str]} -- The SCC password. If none is given, it will prompt - the user. (default: {None}) - download_strategy {DownloadStrategy} -- A function to determine which files need to - be downloaded. Can save bandwidth and reduce the number of requests. - (default: {download_modified_or_new}) - clean {bool} -- Whether to clean up when the method finishes. - timeout {int} -- The download timeout for opencast videos. Sadly needed due to a - requests bug. - file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal - with overwriting or deleting files. The default always asks the user. - """ - # This authenticator only works with the KIT ilias instance. 
- authenticator = Pferd._get_authenticator(username=username, password=password) - PRETTY.starting_synchronizer(target, "ILIAS", course_id) - - organizer = self._ilias( - target=target, - base_url="https://ilias.studium.kit.edu/", - crawl_function=lambda crawler: crawler.crawl_course(course_id), - authenticator=authenticator, - cookies=cookies, - dir_filter=dir_filter, - transform=transform, - download_strategy=download_strategy, - clean=clean, - timeout=timeout, - file_conflict_resolver=file_conflict_resolver - ) - - self._download_summary.merge(organizer.download_summary) - - return organizer - - def print_summary(self) -> None: - """ - Prints the accumulated download summary. - """ - PRETTY.summary(self._download_summary) - - @swallow_and_print_errors - def ilias_kit_personal_desktop( - self, - target: PathLike, - dir_filter: IliasDirectoryFilter = lambda x, y: True, - transform: Transform = lambda x: x, - cookies: Optional[PathLike] = None, - username: Optional[str] = None, - password: Optional[str] = None, - download_strategy: IliasDownloadStrategy = download_modified_or_new, - clean: bool = True, - timeout: int = 5, - file_conflict_resolver: FileConflictResolver = resolve_prompt_user - ) -> Organizer: - """ - Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS - "personal desktop" instead of a single course. - - Arguments: - target {Path} -- the target path to write the data to - - Keyword Arguments: - dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the - crawler level, these directories and all of their content is skipped. - (default: {lambdax:True}) - transform {Transform} -- A transformation function for the output paths. Return None - to ignore a file. (default: {lambdax:x}) - cookies {Optional[Path]} -- The path to store and load cookies from. - (default: {None}) - username {Optional[str]} -- The SCC username. If none is given, it will prompt - the user. 
(default: {None}) - password {Optional[str]} -- The SCC password. If none is given, it will prompt - the user. (default: {None}) - download_strategy {DownloadStrategy} -- A function to determine which files need to - be downloaded. Can save bandwidth and reduce the number of requests. - (default: {download_modified_or_new}) - clean {bool} -- Whether to clean up when the method finishes. - timeout {int} -- The download timeout for opencast videos. Sadly needed due to a - requests bug. - file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal - with overwriting or deleting files. The default always asks the user. - """ - # This authenticator only works with the KIT ilias instance. - authenticator = Pferd._get_authenticator(username, password) - PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop") - - organizer = self._ilias( - target=target, - base_url="https://ilias.studium.kit.edu/", - crawl_function=lambda crawler: crawler.crawl_personal_desktop(), - authenticator=authenticator, - cookies=cookies, - dir_filter=dir_filter, - transform=transform, - download_strategy=download_strategy, - clean=clean, - timeout=timeout, - file_conflict_resolver=file_conflict_resolver - ) - - self._download_summary.merge(organizer.download_summary) - - return organizer - - @swallow_and_print_errors - def ilias_kit_folder( - self, - target: PathLike, - full_url: str, - dir_filter: IliasDirectoryFilter = lambda x, y: True, - transform: Transform = lambda x: x, - cookies: Optional[PathLike] = None, - username: Optional[str] = None, - password: Optional[str] = None, - download_strategy: IliasDownloadStrategy = download_modified_or_new, - clean: bool = True, - timeout: int = 5, - file_conflict_resolver: FileConflictResolver = resolve_prompt_user - ) -> Organizer: - """ - Synchronizes a folder with a given folder on the ILIAS instance of the KIT. 
- - Arguments: - target {Path} -- the target path to write the data to - full_url {str} -- the full url of the folder/videos/course to crawl - - Keyword Arguments: - dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the - crawler level, these directories and all of their content is skipped. - (default: {lambdax:True}) - transform {Transform} -- A transformation function for the output paths. Return None - to ignore a file. (default: {lambdax:x}) - cookies {Optional[Path]} -- The path to store and load cookies from. - (default: {None}) - username {Optional[str]} -- The SCC username. If none is given, it will prompt - the user. (default: {None}) - password {Optional[str]} -- The SCC password. If none is given, it will prompt - the user. (default: {None}) - download_strategy {DownloadStrategy} -- A function to determine which files need to - be downloaded. Can save bandwidth and reduce the number of requests. - (default: {download_modified_or_new}) - clean {bool} -- Whether to clean up when the method finishes. - timeout {int} -- The download timeout for opencast videos. Sadly needed due to a - requests bug. - file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal - with overwriting or deleting files. The default always asks the user. - """ - # This authenticator only works with the KIT ilias instance. 
- authenticator = Pferd._get_authenticator(username=username, password=password) - PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url") - - if not full_url.startswith("https://ilias.studium.kit.edu"): - raise FatalException("Not a valid KIT ILIAS URL") - - organizer = self._ilias( - target=target, - base_url="https://ilias.studium.kit.edu/", - crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url), - authenticator=authenticator, - cookies=cookies, - dir_filter=dir_filter, - transform=transform, - download_strategy=download_strategy, - clean=clean, - timeout=timeout, - file_conflict_resolver=file_conflict_resolver - ) - - self._download_summary.merge(organizer.download_summary) - - return organizer - - @swallow_and_print_errors - def ipd_kit( - self, - target: Union[PathLike, Organizer], - url: str, - transform: Transform = lambda x: x, - download_strategy: IpdDownloadStrategy = ipd_download_new_or_modified, - clean: bool = True, - file_conflict_resolver: FileConflictResolver = resolve_prompt_user - ) -> Organizer: - """ - Synchronizes a folder with a DIVA playlist. - - Arguments: - target {Union[PathLike, Organizer]} -- The organizer / target folder to use. - url {str} -- the url to the page - - Keyword Arguments: - transform {Transform} -- A transformation function for the output paths. Return None - to ignore a file. (default: {lambdax:x}) - download_strategy {DivaDownloadStrategy} -- A function to determine which files need to - be downloaded. Can save bandwidth and reduce the number of requests. - (default: {diva_download_new}) - clean {bool} -- Whether to clean up when the method finishes. - file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal - with overwriting or deleting files. The default always asks the user. 
- """ - tmp_dir = self._tmp_dir.new_subdir() - - if target is None: - PRETTY.starting_synchronizer("None", "IPD", url) - raise FatalException("Got 'None' as target directory, aborting") - - if isinstance(target, Organizer): - organizer = target - else: - organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver) - - PRETTY.starting_synchronizer(organizer.path, "IPD", url) - - elements: List[IpdDownloadInfo] = IpdCrawler(url).crawl() - transformed = apply_transform(transform, elements) - - if self._test_run: - self._print_transformables(transformed) - return organizer - - downloader = IpdDownloader(tmp_dir=tmp_dir, organizer=organizer, strategy=download_strategy) - downloader.download_all(transformed) - - if clean: - organizer.cleanup() - - self._download_summary.merge(organizer.download_summary) - - return organizer - - @swallow_and_print_errors - def diva_kit( - self, - target: Union[PathLike, Organizer], - playlist_location: str, - transform: Transform = lambda x: x, - download_strategy: DivaDownloadStrategy = diva_download_new, - clean: bool = True, - file_conflict_resolver: FileConflictResolver = resolve_prompt_user - ) -> Organizer: - """ - Synchronizes a folder with a DIVA playlist. - - Arguments: - organizer {Organizer} -- The organizer to use. - playlist_location {str} -- the playlist id or the playlist URL - in the format 'https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271' - - Keyword Arguments: - transform {Transform} -- A transformation function for the output paths. Return None - to ignore a file. (default: {lambdax:x}) - download_strategy {DivaDownloadStrategy} -- A function to determine which files need to - be downloaded. Can save bandwidth and reduce the number of requests. - (default: {diva_download_new}) - clean {bool} -- Whether to clean up when the method finishes. - file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal - with overwriting or deleting files. 
The default always asks the user. - """ - tmp_dir = self._tmp_dir.new_subdir() - - if playlist_location.startswith("http"): - playlist_id = DivaPlaylistCrawler.fetch_id(playlist_link=playlist_location) - else: - playlist_id = playlist_location - - if target is None: - PRETTY.starting_synchronizer("None", "DIVA", playlist_id) - raise FatalException("Got 'None' as target directory, aborting") - - if isinstance(target, Organizer): - organizer = target - else: - organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver) - - PRETTY.starting_synchronizer(organizer.path, "DIVA", playlist_id) - - crawler = DivaPlaylistCrawler(playlist_id) - downloader = DivaDownloader(tmp_dir, organizer, download_strategy) - - info = crawler.crawl() - - transformed = apply_transform(transform, info) - if self._test_run: - self._print_transformables(transformed) - return organizer - - downloader.download_all(transformed) - - if clean: - organizer.cleanup() - - self._download_summary.merge(organizer.download_summary) - - return organizer + async def run(self) -> None: + print("Bleep bloop 1") + await CRAWLERS["dummy"]("dummy", self._config._parser["dummy"]).run() + print("Bleep bloop 2") From ac3bfd7388af31c8feab2c3835a382f23c498034 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 13:53:16 +0200 Subject: [PATCH 071/524] Make progress bars easier to use The crawler now supports two types of progress bars --- PFERD/crawler.py | 20 +++++++++++++++++--- PFERD/crawlers/dummy.py | 16 +++++++++------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 9f1c7d9..0092744 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -2,7 +2,8 @@ import configparser from abc import ABC, abstractmethod from contextlib import asynccontextmanager from pathlib import Path -from typing import AsyncIterator, Optional +# TODO In Python 3.9 and above, AsyncContextManager is deprecated +from typing import AsyncContextManager, 
AsyncIterator, Optional from rich.markup import escape @@ -40,14 +41,27 @@ class Crawler(ABC): @asynccontextmanager async def progress_bar( self, - path: Path, + desc: str, total: Optional[int] = None, ) -> AsyncIterator[ProgressBar]: - desc = escape(str(path)) async with self._limiter.limit(): with self._conductor.progress_bar(desc, total=total) as bar: yield bar + def crawl_bar(self, path: Path) -> AsyncContextManager[ProgressBar]: + path = escape(str(path)) + desc = f"[bold magenta]Crawling[/bold magenta] {path}" + return self.progress_bar(desc) + + def download_bar( + self, + path: Path, + size: int, + ) -> AsyncContextManager[ProgressBar]: + path = escape(str(path)) + desc = f"[bold green]Downloading[/bold green] {path}" + return self.progress_bar(desc, total=size) + async def run(self) -> None: await self._conductor.start() try: diff --git a/PFERD/crawlers/dummy.py b/PFERD/crawlers/dummy.py index b4d787a..a88216b 100644 --- a/PFERD/crawlers/dummy.py +++ b/PFERD/crawlers/dummy.py @@ -14,11 +14,13 @@ DUMMY_TREE = { "Blatt_03.pdf": (), "Blatt_04.pdf": (), "Blatt_05.pdf": (), - "Blatt_01_Lösung.pdf": (), - "Blatt_02_Lösung.pdf": (), - "Blatt_03_Lösung.pdf": (), - "Blatt_04_Lösung.pdf": (), - "Blatt_05_Lösung.pdf": (), + "Lösungen": { + "Blatt_01_Lösung.pdf": (), + "Blatt_02_Lösung.pdf": (), + "Blatt_03_Lösung.pdf": (), + "Blatt_04_Lösung.pdf": (), + "Blatt_05_Lösung.pdf": (), + }, }, "Vorlesungsfolien": { "VL_01.pdf": (), @@ -39,7 +41,7 @@ class DummyCrawler(Crawler): async def _crawl_entry(self, path: Path, value: Any) -> None: if value == (): n = random.randint(5, 20) - async with self.progress_bar(path, n) as bar: + async with self.download_bar(path, n) as bar: await asyncio.sleep(random.random() / 2) for i in range(n): await asyncio.sleep(0.5) @@ -47,7 +49,7 @@ class DummyCrawler(Crawler): self.print(f"[green]Downloaded {escape(str(path))}") else: t = random.random() * 2 + 1 - async with self.progress_bar(path) as bar: + async with self.crawl_bar(path) as 
bar: await asyncio.sleep(t) tasks = [self._crawl_entry(path / k, v) for k, v in value.items()] await asyncio.gather(*tasks) From 6431a3fb3db070ddbe5a1ce286e9b375f72b82ad Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 14:23:09 +0200 Subject: [PATCH 072/524] Fix some mypy errors --- PFERD/crawler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 0092744..31aab5b 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -49,8 +49,8 @@ class Crawler(ABC): yield bar def crawl_bar(self, path: Path) -> AsyncContextManager[ProgressBar]: - path = escape(str(path)) - desc = f"[bold magenta]Crawling[/bold magenta] {path}" + pathstr = escape(str(path)) + desc = f"[bold magenta]Crawling[/bold magenta] {pathstr}" return self.progress_bar(desc) def download_bar( @@ -58,8 +58,8 @@ class Crawler(ABC): path: Path, size: int, ) -> AsyncContextManager[ProgressBar]: - path = escape(str(path)) - desc = f"[bold green]Downloading[/bold green] {path}" + pathstr = escape(str(path)) + desc = f"[bold green]Downloading[/bold green] {pathstr}" return self.progress_bar(desc, total=size) async def run(self) -> None: From 2e85d26b6bbb6a392e4123080f3cb9f74a40f0d7 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 14:23:28 +0200 Subject: [PATCH 073/524] Use conductor via context manager --- PFERD/conductor.py | 43 +++++++++++++++++++++++++------------------ PFERD/crawler.py | 5 +---- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/PFERD/conductor.py b/PFERD/conductor.py index fef5a0e..121ed9a 100644 --- a/PFERD/conductor.py +++ b/PFERD/conductor.py @@ -1,6 +1,7 @@ import asyncio from contextlib import asynccontextmanager, contextmanager -from typing import AsyncIterator, Iterator, List, Optional +from types import TracebackType +from typing import AsyncIterator, Iterator, List, Optional, Type import rich from rich.progress import Progress, TaskID @@ -22,24 +23,30 @@ class TerminalConductor: 
self._progress = Progress() self._lines: List[str] = [] - def _start(self) -> None: - for line in self._lines: - rich.print(line) - self._lines = [] - - self._progress.start() - - def _stop(self) -> None: - self._progress.stop() - self._stopped = True - - async def start(self) -> None: + async def _start(self) -> None: async with self._lock: - self._start() + for line in self._lines: + rich.print(line) + self._lines = [] - async def stop(self) -> None: + self._progress.start() + + async def _stop(self) -> None: async with self._lock: - self._stop() + self._progress.stop() + self._stopped = True + + async def __aenter__(self) -> None: + await self._start() + + async def __aexit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ) -> Optional[bool]: + await self._stop() + return None def print(self, line: str) -> None: if self._stopped: @@ -50,11 +57,11 @@ class TerminalConductor: @asynccontextmanager async def exclusive_output(self) -> AsyncIterator[None]: async with self._lock: - self.stop() + self._stop() try: yield finally: - self.start() + self._start() @contextmanager def progress_bar( diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 31aab5b..093ba91 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -63,11 +63,8 @@ class Crawler(ABC): return self.progress_bar(desc, total=size) async def run(self) -> None: - await self._conductor.start() - try: + async with self._conductor: await self.crawl() - finally: - await self._conductor.stop() @abstractmethod async def crawl(self) -> None: From d96a361325ed1c16bbfd9af725484a6470d5de49 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 15:26:10 +0200 Subject: [PATCH 074/524] Test and fix exclusive output --- PFERD/conductor.py | 34 ++++++++++++++++++++-------------- PFERD/crawler.py | 3 +++ PFERD/crawlers/dummy.py | 8 ++++++-- PFERD/utils.py | 29 ++++++++++++++++++++++++++--- 4 files changed, 55 insertions(+), 19 
deletions(-) diff --git a/PFERD/conductor.py b/PFERD/conductor.py index 121ed9a..161a287 100644 --- a/PFERD/conductor.py +++ b/PFERD/conductor.py @@ -3,7 +3,6 @@ from contextlib import asynccontextmanager, contextmanager from types import TracebackType from typing import AsyncIterator, Iterator, List, Optional, Type -import rich from rich.progress import Progress, TaskID @@ -24,20 +23,26 @@ class TerminalConductor: self._lines: List[str] = [] async def _start(self) -> None: - async with self._lock: - for line in self._lines: - rich.print(line) - self._lines = [] + for task in self._progress.tasks: + task.visible = True + self._progress.start() - self._progress.start() + self._stopped = False + + for line in self._lines: + self.print(line) + self._lines = [] async def _stop(self) -> None: - async with self._lock: - self._progress.stop() - self._stopped = True + self._stopped = True + + for task in self._progress.tasks: + task.visible = False + self._progress.stop() async def __aenter__(self) -> None: - await self._start() + async with self._lock: + await self._start() async def __aexit__( self, @@ -45,23 +50,24 @@ class TerminalConductor: exc_value: Optional[BaseException], traceback: Optional[TracebackType], ) -> Optional[bool]: - await self._stop() + async with self._lock: + await self._stop() return None def print(self, line: str) -> None: if self._stopped: self._lines.append(line) else: - rich.print(line) + self._progress.console.print(line) @asynccontextmanager async def exclusive_output(self) -> AsyncIterator[None]: async with self._lock: - self._stop() + await self._stop() try: yield finally: - self._start() + await self._start() @contextmanager def progress_bar( diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 093ba91..6326b90 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -38,6 +38,9 @@ class Crawler(ABC): def print(self, text: str) -> None: self._conductor.print(text) + def exclusive_output(self): + return self._conductor.exclusive_output() 
+ @asynccontextmanager async def progress_bar( self, diff --git a/PFERD/crawlers/dummy.py b/PFERD/crawlers/dummy.py index a88216b..46a7a69 100644 --- a/PFERD/crawlers/dummy.py +++ b/PFERD/crawlers/dummy.py @@ -6,6 +6,7 @@ from typing import Any from rich.markup import escape from ..crawler import Crawler +from ..utils import ainput DUMMY_TREE = { "Blätter": { @@ -17,7 +18,7 @@ DUMMY_TREE = { "Lösungen": { "Blatt_01_Lösung.pdf": (), "Blatt_02_Lösung.pdf": (), - "Blatt_03_Lösung.pdf": (), + "Blatt_03_Lösung.pdf": True, "Blatt_04_Lösung.pdf": (), "Blatt_05_Lösung.pdf": (), }, @@ -39,7 +40,10 @@ class DummyCrawler(Crawler): await self._crawl_entry(Path(), DUMMY_TREE) async def _crawl_entry(self, path: Path, value: Any) -> None: - if value == (): + if value is True: + async with self.exclusive_output(): + await ainput(f"File {path}, please press enter: ") + if value == () or value is True: n = random.randint(5, 20) async with self.download_bar(path, n) as bar: await asyncio.sleep(random.random() / 2) diff --git a/PFERD/utils.py b/PFERD/utils.py index 4e1b5d7..3808f1d 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -1,7 +1,30 @@ -from typing import Optional +import functools +import contextvars +import asyncio +import getpass +from typing import Any, Callable, Optional, TypeVar + +T = TypeVar("T") -def prompt_yes_no(query: str, default: Optional[bool]) -> bool: +# TODO When switching to 3.9, use asyncio.to_thread instead of this +async def to_thread(func: Callable[..., T], *args: Any, **kwargs: Any) -> T: + # https://github.com/python/cpython/blob/8d47f92d46a92a5931b8f3dcb4a484df672fc4de/Lib/asyncio/threads.py + loop = asyncio.get_event_loop() + ctx = contextvars.copy_context() + func_call = functools.partial(ctx.run, func, *args, **kwargs) + return await loop.run_in_executor(None, func_call) + + +async def ainput(prompt: Optional[str] = None) -> str: + return await to_thread(lambda: input(prompt)) + + +async def agetpass(prompt: Optional[str] = None) -> str: + 
return await to_thread(lambda: getpass.getpass(prompt)) + + +async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: """ Asks the user a yes/no question and returns their choice. """ @@ -14,7 +37,7 @@ def prompt_yes_no(query: str, default: Optional[bool]) -> bool: query += " [y/n] " while True: - response = input(query).strip().lower() + response = (await ainput(query)).strip().lower() if response == "y": return True elif response == "n": From d2103d7c44f6d342cd9b6a1829a4da3f1adaf240 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 15:43:20 +0200 Subject: [PATCH 075/524] Document crawler --- PFERD/crawler.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 6326b90..36c528d 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -19,6 +19,11 @@ class CrawlerLoadException(Exception): class Crawler(ABC): def __init__(self, name: str, section: configparser.SectionProxy) -> None: """ + Initialize a crawler from its name and its section in the config file. + + If you are writing your own constructor for your own crawler, make sure + to call this constructor first (via super().__init__). + May throw a CrawlerLoadException. """ @@ -36,9 +41,28 @@ class Crawler(ABC): # output_dir = Path(section.get("output_dir", name)) def print(self, text: str) -> None: + """ + Print rich markup to the terminal. Crawlers *must* use this function to + print things unless they are holding an exclusive output context + manager! Be careful to escape all user-supplied strings. + """ + self._conductor.print(text) def exclusive_output(self): + """ + Acquire exclusive rights™ to the terminal output. While this context + manager is held, output such as printing and progress bars from other + threads is suspended and the current thread may do whatever it wants + with the terminal. However, it must return the terminal to its original + state before exiting the context manager. 
+ + No two threads can hold this context manager at the same time. + + Useful for password or confirmation prompts as well as running other + programs while crawling (e. g. to get certain credentials). + """ + return self._conductor.exclusive_output() @asynccontextmanager @@ -66,9 +90,21 @@ class Crawler(ABC): return self.progress_bar(desc, total=size) async def run(self) -> None: + """ + Start the crawling process. Call this function if you want to use a + crawler. + """ + async with self._conductor: await self.crawl() @abstractmethod async def crawl(self) -> None: + """ + Overwrite this function if you are writing a crawler. + + This function must not return before all crawling is complete. To crawl + multiple things concurrently, asyncio.gather can be used. + """ + pass From 502654d8535ed4be4bd93f978232117dd5e210fd Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 15:47:52 +0200 Subject: [PATCH 076/524] Fix mypy errors --- PFERD/crawler.py | 2 +- PFERD/utils.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 36c528d..376cada 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -49,7 +49,7 @@ class Crawler(ABC): self._conductor.print(text) - def exclusive_output(self): + def exclusive_output(self) -> AsyncContextManager[None]: """ Acquire exclusive rights™ to the terminal output. 
While this context manager is held, output such as printing and progress bars from other diff --git a/PFERD/utils.py b/PFERD/utils.py index 3808f1d..08017aa 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -1,6 +1,6 @@ -import functools -import contextvars import asyncio +import contextvars +import functools import getpass from typing import Any, Callable, Optional, TypeVar @@ -13,14 +13,14 @@ async def to_thread(func: Callable[..., T], *args: Any, **kwargs: Any) -> T: loop = asyncio.get_event_loop() ctx = contextvars.copy_context() func_call = functools.partial(ctx.run, func, *args, **kwargs) - return await loop.run_in_executor(None, func_call) + return await loop.run_in_executor(None, func_call) # type: ignore -async def ainput(prompt: Optional[str] = None) -> str: +async def ainput(prompt: str) -> str: return await to_thread(lambda: input(prompt)) -async def agetpass(prompt: Optional[str] = None) -> str: +async def agetpass(prompt: str) -> str: return await to_thread(lambda: getpass.getpass(prompt)) From 20a24dbcbf747fd82a6e8202d03429534504c799 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 16:14:50 +0200 Subject: [PATCH 077/524] Add changelog --- CHANGELOG.md | 20 ++++++++++++++++++++ README.md | 1 + 2 files changed, 21 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..14966d7 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,20 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +### Added +- Support for concurrent downloads +- Support for proper config files +- This changelog + +### Changed +- Rewrote almost everything +- Redesigned CLI + +### Removed +- Backwards compatibility with 2.x diff --git a/README.md b/README.md index 5b74de5..9f82f4f 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ Other resources: +- [Changelog](CHANGELOG.md) - [Development Guide](DEV.md) ## Installation with pip From 0096d83387a75d7367e3fc42cfb4a58a1c5191f4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 16:37:42 +0200 Subject: [PATCH 078/524] Simplify Limiter implementation --- PFERD/limiter.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/PFERD/limiter.py b/PFERD/limiter.py index ff91d57..ae72fe6 100644 --- a/PFERD/limiter.py +++ b/PFERD/limiter.py @@ -9,8 +9,5 @@ class Limiter: @asynccontextmanager async def limit(self) -> AsyncIterator[None]: - await self._semaphore.acquire() - try: + async with self._semaphore: yield - finally: - self._semaphore.release() From f776186480bd4f1955edbcc54365379ee0478e00 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 16:52:00 +0200 Subject: [PATCH 079/524] Use PurePath instead of Path Path should only be used when we need to access the file system. For all other purposes (mainly crawling), we use PurePath instead since the paths don't correspond to paths in the local file system. 
--- PFERD/crawler.py | 9 +++++---- PFERD/crawlers/dummy.py | 6 +++--- PFERD/transformer.py | 26 +++++++++++++------------- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 376cada..9ceca20 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -1,7 +1,7 @@ import configparser from abc import ABC, abstractmethod from contextlib import asynccontextmanager -from pathlib import Path +from pathlib import PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated from typing import AsyncContextManager, AsyncIterator, Optional @@ -38,7 +38,8 @@ class Crawler(ABC): e.pretty_print() raise CrawlerLoadException() - # output_dir = Path(section.get("output_dir", name)) + # working_dir = Path(section.get("working_dir", "")) + # output_dir = working_dir / section.get("output_dir", name) def print(self, text: str) -> None: """ @@ -75,14 +76,14 @@ class Crawler(ABC): with self._conductor.progress_bar(desc, total=total) as bar: yield bar - def crawl_bar(self, path: Path) -> AsyncContextManager[ProgressBar]: + def crawl_bar(self, path: PurePath) -> AsyncContextManager[ProgressBar]: pathstr = escape(str(path)) desc = f"[bold magenta]Crawling[/bold magenta] {pathstr}" return self.progress_bar(desc) def download_bar( self, - path: Path, + path: PurePath, size: int, ) -> AsyncContextManager[ProgressBar]: pathstr = escape(str(path)) diff --git a/PFERD/crawlers/dummy.py b/PFERD/crawlers/dummy.py index 46a7a69..204b4b1 100644 --- a/PFERD/crawlers/dummy.py +++ b/PFERD/crawlers/dummy.py @@ -1,6 +1,6 @@ import asyncio import random -from pathlib import Path +from pathlib import PurePath from typing import Any from rich.markup import escape @@ -37,9 +37,9 @@ DUMMY_TREE = { class DummyCrawler(Crawler): async def crawl(self) -> None: - await self._crawl_entry(Path(), DUMMY_TREE) + await self._crawl_entry(PurePath(), DUMMY_TREE) - async def _crawl_entry(self, path: Path, value: Any) -> None: + async def _crawl_entry(self, 
path: PurePath, value: Any) -> None: if value is True: async with self.exclusive_output(): await ainput(f"File {path}, please press enter: ") diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 1ecaf19..298c580 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -1,22 +1,22 @@ import re from abc import ABC, abstractmethod from dataclasses import dataclass -from pathlib import Path +from pathlib import PurePath from typing import Dict, Optional, Union class Rule(ABC): @abstractmethod - def transform(self, path: Path) -> Optional[Path]: + def transform(self, path: PurePath) -> Optional[PurePath]: pass class NormalRule(Rule): - def __init__(self, left: Path, right: Path): + def __init__(self, left: PurePath, right: PurePath): self._left = left self._right = right - def _match_prefix(self, path: Path) -> Optional[Path]: + def _match_prefix(self, path: PurePath) -> Optional[PurePath]: left_parts = list(reversed(self._left.parts)) path_parts = list(reversed(path.parts)) @@ -33,9 +33,9 @@ class NormalRule(Rule): if left_parts: return None - return Path(*path_parts) + return PurePath(*path_parts) - def transform(self, path: Path) -> Optional[Path]: + def transform(self, path: PurePath) -> Optional[PurePath]: if rest := self._match_prefix(path): return self._right / rest @@ -43,11 +43,11 @@ class NormalRule(Rule): class ExactRule(Rule): - def __init__(self, left: Path, right: Path): + def __init__(self, left: PurePath, right: PurePath): self._left = left self._right = right - def transform(self, path: Path) -> Optional[Path]: + def transform(self, path: PurePath) -> Optional[PurePath]: if path == self._left: return self._right @@ -59,7 +59,7 @@ class ReRule(Rule): self._left = left self._right = right - def transform(self, path: Path) -> Optional[Path]: + def transform(self, path: PurePath) -> Optional[PurePath]: if match := re.fullmatch(self._left, str(path)): kwargs: Dict[str, Union[int, float]] = {} @@ -75,7 +75,7 @@ class ReRule(Rule): except 
ValueError: pass - return Path(self._right.format(*groups, **kwargs)) + return PurePath(self._right.format(*groups, **kwargs)) return None @@ -208,9 +208,9 @@ def parse_rule(line: Line) -> Rule: right = parse_string(line) if arrowname == "": - return NormalRule(Path(left), Path(right)) + return NormalRule(PurePath(left), PurePath(right)) elif arrowname == "exact": - return ExactRule(Path(left), Path(right)) + return ExactRule(PurePath(left), PurePath(right)) elif arrowname == "re": return ReRule(left, right) else: @@ -230,7 +230,7 @@ class Transformer: if line: self._rules.append(parse_rule(Line(line, i))) - def transform(self, path: Path) -> Optional[Path]: + def transform(self, path: PurePath) -> Optional[PurePath]: for rule in self._rules: if result := rule.transform(path): return result From 9ec19be11345e816f243aaf8514ce1be7a5c07cc Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 18:55:08 +0200 Subject: [PATCH 080/524] Document config file format --- CONFIG.md | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 1 + 2 files changed, 139 insertions(+) create mode 100644 CONFIG.md diff --git a/CONFIG.md b/CONFIG.md new file mode 100644 index 0000000..8acb97c --- /dev/null +++ b/CONFIG.md @@ -0,0 +1,138 @@ +# Config file format + +A config file consists of sections. A section begins with a `[section]` header, +which is followed by a list of `key = value` or `key: value` pairs. Comments +must be on their own line and start with `#` or `;`. Multiline values must be +indented beyond their key. For more details and some examples on the format, see +the [configparser documentation][1] ([basic interpolation][2] is enabled). + +[1]: "Supported INI File Structure" +[2]: "BasicInterpolation" + +## The `DEFAULT` section + +This section contains global configuration values. It can also be used to set +default values for the other sections. + +- `working_dir`: The directory PFERD operates in. 
Set to an absolute path to + make PFERD operate the same regardless of where it is executed. All other + paths in the config file are interpreted relative to this path. If this path + is relative, it is interpreted relative to the script's working dir. `~` is + expanded to the current user's home directory. (Default: `.`) + +## The `crawl:*` sections + +Sections whose names start with `crawl:` are used to configure crawlers. The +rest of the section name specifies the name of the crawler. + +A crawler synchronizes a remote resource to a local directory. There are +different types of crawlers for different kinds of resources, e. g. ILIAS +courses or lecture websites. + +Each crawl section represents an instance of a specific type of crawler. The +`type` option is used to specify the crawler type. The crawler's name is usually +used as the name for the output directory. New crawlers can be created simply by +adding a new crawl section to the config file. + +Depending on a crawler's type, it may have different options. For more details, +see the type's documentation below. The following options are common to all +crawlers: + +- `type`: The types are specified in [this section](#crawler-types). +- `output_dir`: The directory the crawler synchronizes files to. A crawler will + never place any files outside of this directory. (Default: crawler's name) +- `transform`: Rules for renaming and excluding certain files and directories. + For more details, see [this section](#transformation-rules). (Default: empty) + +## The `auth:*` sections + +Sections whose names start with `auth:` are used to configure authenticators. An +authenticator provides login credentials to one or more crawlers. + +Authenticators work similar to crawlers: A section represents an authenticator +instance, whose name is the rest of the section name. The type is specified by +the `type` option. + +Depending on an authenticator's type, it may have different options. 
For more +details, see the type's documentation below. The only option common to all +authenticators is `type`: + +- `type`: The types are specified in [this section](#authenticator-types). + +## Crawler types + +TODO Fill in as crawlers are implemented + +## Authenticator types + +TODO Fill in as authenticators are implemented + +## Transformation rules + +Transformation rules are rules for renaming and excluding files and directories. +They are specified line-by-line in a crawler's `transform` option. When a +crawler needs to apply a rule to a path, it goes through this list top-to-bottom +and choose the first matching rule. + +Each line has the format `SOURCE ARROW TARGET` where `TARGET` is optional. +`SOURCE` is either a normal path without spaces (e. g. `foo/bar`), or a string +literal delimited by `"` or `'` (e. g. `"foo\" bar/baz"`). Python's string +escape syntax is supported. Trailing slashes are ignored. `TARGET` can be +formatted like `SOURCE`, but it can also be a single exclamation mark without +quotes (`!`). `ARROW` is one of `-->`, `-exact->` and `-re->`. + +If a rule's target is `!`, this means that when the rule matches on a path, the +corresponding file or directory is ignored. If a rule's target is missing, the +path is matched but not modified. + +### The `-->` arrow + +The `-->` arrow is a basic renaming operation. If a path begins with `SOURCE`, +that part of the path is replaced with `TARGET`. This means that the rule +`foo/bar --> baz` would convert `foo/bar` into `baz`, but also `foo/bar/xyz` +into `baz/xyz`. The rule `foo --> !` would ignore a directory named `foo` as +well as all its contents. + +### The `-exact->` arrow + +The `-exact->` arrow requires the path to match `SOURCE` exactly. This means +that the rule `foo/bar -exact-> baz` would still convert `foo/bar` into `baz`, +but `foo/bar/xyz` would be unaffected. Also, `foo -exact-> !` would only ignore +`foo`, but not its contents (if it has any). 
The examples below show why this is +useful. + +### The `-re->` arrow + +The `-re->` arrow uses regular expressions. `SOURCE` is a regular expression +that must match the entire path. If this is the case, then the capturing groups +are available in `TARGET` for formatting. + +### Example: Tutorials + +You have ILIAS course with lots of tutorials, but are only interested in a +single one? + +``` +tutorials/ + |- tut_01/ + |- tut_02/ + |- tut_03/ + ... +``` + +You can use a mix of normal and exact arrows to get rid of the other ones and +move the `tutorials/tut_02/` folder to `my_tut/`: + +``` +tutorials/tut_02 --> my_tut +tutorials -exact-> +tutorials --> ! +``` + +The second rule is required for many crawlers since they use the rules to decide +which directories to crawl. If it was missing when the crawler looks at +`tutorials/`, the third rule would match. This means the crawler would not crawl +the `tutorials/` directory and thus not discover that `tutorials/tut02/` +existed. + +Since the second rule is only relevant for crawling, the `TARGET` is left out. diff --git a/README.md b/README.md index 9f82f4f..f9d718e 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ Other resources: +- [Config file format](CONFIG.md) - [Changelog](CHANGELOG.md) - [Development Guide](DEV.md) From e7a51decb098a30019a1393b723956c99a85ef85 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 29 Apr 2021 20:13:46 +0200 Subject: [PATCH 081/524] Elaborate on transforms and implement changes --- CONFIG.md | 36 ++++++++++++++ PFERD/transformer.py | 111 ++++++++++++++++++++++++++++++++----------- 2 files changed, 120 insertions(+), 27 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 8acb97c..05f3363 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -107,6 +107,21 @@ The `-re->` arrow uses regular expressions. `SOURCE` is a regular expression that must match the entire path. If this is the case, then the capturing groups are available in `TARGET` for formatting. 
+`TARGET` uses Python's [format string syntax][3]. The *n*-th capturing group can +be referred to as `{g}` (e. g. `{g3}`). `{g0}` refers to the original path. +If capturing group *n*'s contents are a valid integer, the integer value is +available as `{i}` (e. g. `{i3}`). If capturing group *n*'s contents are a +valid float, the float value is available as `{f}` (e. g. `{f3}`). + +Python's format string syntax has rich options for formatting its arguments. For +example, to left-pad the capturing group 3 with the digit `0` to width 5, you +can use `{i3:05}`. + +PFERD even allows you to write entire expressions inside the curly braces, for +example `{g2.lower()}` or `{g3.replace(' ', '_')}`. + +[3]: "Format String Syntax" + ### Example: Tutorials You have ILIAS course with lots of tutorials, but are only interested in a @@ -136,3 +151,24 @@ the `tutorials/` directory and thus not discover that `tutorials/tut02/` existed. Since the second rule is only relevant for crawling, the `TARGET` is left out. + +### Example: Lecture slides + +You have a course with slides like `Lecture 3: Linear functions.PDF` and you +would like to rename them to `03_linear_functions.pdf`. + +``` +Lectures/ + |- Lecture 1: Introduction.PDF + |- Lecture 2: Vectors and matrices.PDF + |- Lecture 3: Linear functions.PDF + ... +``` + +To do this, you can use the most powerful of arrows, the regex arrow. + +``` +"Lectures/Lecture (\\d+): (.*)\\.PDF" -re-> "Lectures/{i1:02}_{g2.lower().replace(' ', '_')}.pdf" +``` + +Note the escaped backslashes on the `SOURCE` side. diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 298c580..84332df 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -1,3 +1,9 @@ +# I'm sorry that this code has become a bit dense and unreadable. While +# reading, it is important to remember what True and False mean. I'd love to +# have some proper sum-types for the inputs and outputs, they'd make this code +# a lot easier to understand. 
+ +import ast import re from abc import ABC, abstractmethod from dataclasses import dataclass @@ -7,12 +13,23 @@ from typing import Dict, Optional, Union class Rule(ABC): @abstractmethod - def transform(self, path: PurePath) -> Optional[PurePath]: + def transform(self, path: PurePath) -> Union[PurePath, bool]: + """ + Try to apply this rule to the path. Returns another path if the rule + was successfully applied, True if the rule matched but resulted in an + exclamation mark, and False if the rule didn't match at all. + """ + pass +# These rules all use a Union[T, bool] for their right side. They are passed a +# T if the arrow's right side was a normal string, True if it was an +# exclamation mark and False if it was missing entirely. + class NormalRule(Rule): - def __init__(self, left: PurePath, right: PurePath): + def __init__(self, left: PurePath, right: Union[PurePath, bool]): + self._left = left self._right = right @@ -35,49 +52,61 @@ class NormalRule(Rule): return PurePath(*path_parts) - def transform(self, path: PurePath) -> Optional[PurePath]: + def transform(self, path: PurePath) -> Union[PurePath, bool]: if rest := self._match_prefix(path): - return self._right / rest + if isinstance(self._right, bool): + return self._right or path + else: + return self._right / rest - return None + return False class ExactRule(Rule): - def __init__(self, left: PurePath, right: PurePath): + def __init__(self, left: PurePath, right: Union[PurePath, bool]): self._left = left self._right = right - def transform(self, path: PurePath) -> Optional[PurePath]: + def transform(self, path: PurePath) -> Union[PurePath, bool]: if path == self._left: - return self._right + if isinstance(self._right, bool): + return self._right or path + else: + return self._right - return None + return False class ReRule(Rule): - def __init__(self, left: str, right: str): + def __init__(self, left: str, right: Union[str, bool]): self._left = left self._right = right - def transform(self, path: 
PurePath) -> Optional[PurePath]: + def transform(self, path: PurePath) -> Union[PurePath, bool]: if match := re.fullmatch(self._left, str(path)): - kwargs: Dict[str, Union[int, float]] = {} + if isinstance(self._right, bool): + return self._right or path + + vars: Dict[str, Union[str, int, float]] = {} groups = [match[0]] + list(match.groups()) for i, group in enumerate(groups): + vars[f"g{i}"] = group + try: - kwargs[f"i{i}"] = int(group) + vars[f"i{i}"] = int(group) except ValueError: pass try: - kwargs[f"f{i}"] = float(group) + vars[f"f{i}"] = float(group) except ValueError: pass - return PurePath(self._right.format(*groups, **kwargs)) + result = eval(f"f{self._right!r}", vars) + return PurePath(result) - return None + return False @dataclass @@ -136,7 +165,9 @@ QUOTATION_MARKS = {'"', "'"} def parse_string_literal(line: Line) -> str: escaped = False - result = [] + + # Points to first character of string literal + start_index = line.index quotation_mark = line.get() if quotation_mark not in QUOTATION_MARKS: @@ -147,17 +178,17 @@ def parse_string_literal(line: Line) -> str: while c := line.get(): if escaped: - result.append(c) escaped = False line.advance() elif c == quotation_mark: line.advance() - return "".join(result) + stop_index = line.index + literal = line.line[start_index:stop_index] + return ast.literal_eval(literal) elif c == "\\": escaped = True line.advance() else: - result.append(c) line.advance() raise RuleParseException(line, "Expected end of string literal") @@ -174,11 +205,14 @@ def parse_until_space_or_eol(line: Line) -> str: return "".join(result) -def parse_string(line: Line) -> str: +def parse_string(line: Line) -> Union[str, bool]: if line.get() in QUOTATION_MARKS: return parse_string_literal(line) else: - return parse_until_space_or_eol(line) + string = parse_until_space_or_eol(line) + if string == "!": + return True + return string def parse_arrow(line: Line) -> str: @@ -200,17 +234,35 @@ def parse_arrow(line: Line) -> str: def 
parse_rule(line: Line) -> Rule: + # Parse left side + leftindex = line.index left = parse_string(line) + if isinstance(left, bool): + line.index = leftindex + raise RuleParseException(line, "Left side can't be '!'") + + # Parse arrow line.expect(" ") arrowindex = line.index arrowname = parse_arrow(line) - line.expect(" ") - right = parse_string(line) + # Parse right side + if line.get(): + line.expect(" ") + right = parse_string(line) + else: + right = False + rightpath: Union[PurePath, bool] + if isinstance(right, bool): + rightpath = right + else: + rightpath = PurePath(right) + + # Dispatch if arrowname == "": - return NormalRule(PurePath(left), PurePath(right)) + return NormalRule(PurePath(left), rightpath) elif arrowname == "exact": - return ExactRule(PurePath(left), PurePath(right)) + return ExactRule(PurePath(left), rightpath) elif arrowname == "re": return ReRule(left, right) else: @@ -232,7 +284,12 @@ class Transformer: def transform(self, path: PurePath) -> Optional[PurePath]: for rule in self._rules: - if result := rule.transform(path): + result = rule.transform(path) + if isinstance(result, PurePath): return result + elif result: # Exclamation mark + return None + else: + continue return None From a8dcf941b94d8f474f2606a92d1caf57ccb61665 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 30 Apr 2021 15:32:56 +0200 Subject: [PATCH 082/524] Document possible redownload settings --- CONFIG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CONFIG.md b/CONFIG.md index 05f3363..a004dc3 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -41,6 +41,15 @@ crawlers: - `type`: The types are specified in [this section](#crawler-types). - `output_dir`: The directory the crawler synchronizes files to. A crawler will never place any files outside of this directory. (Default: crawler's name) +- `redownload`: When to download again a file that is already present locally. + (Default: `never-smart`) + - `never`: If a file is present locally, it is not downloaded again. 
+ - `never-smart`: Like `never`, but PFERD tries to detect if an already + downloaded files has changed via some (unreliable) heuristics. + - `always`: All files are always downloaded, regardless of whether they are + already present locally. + - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary + downloads via some (unreliable) heuristics. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) From 91c33596daf267541a0f389de252c193d9c2c05e Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 30 Apr 2021 16:22:14 +0200 Subject: [PATCH 083/524] Load crawlers from config file --- PFERD/config.py | 21 +++++++++++++++++++-- PFERD/crawler.py | 8 +++++++- PFERD/pferd.py | 46 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 69 insertions(+), 6 deletions(-) diff --git a/PFERD/config.py b/PFERD/config.py index d71e4d1..d02900d 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -1,7 +1,7 @@ import configparser import os from pathlib import Path -from typing import Optional +from typing import List, Optional, Tuple from .utils import prompt_yes_no @@ -26,7 +26,6 @@ class Config: def __init__(self, parser: configparser.ConfigParser): self._parser = parser - # TODO Load and validate config into dataclasses @staticmethod def _fail_load(path: Path, reason: str) -> None: @@ -99,3 +98,21 @@ class Config: self._fail_dump(path, "That's a directory, not a file") except PermissionError: self._fail_dump(path, "Insufficient permissions") + + @property + def default_section(self) -> configparser.SectionProxy: + return self._parser[configparser.DEFAULTSECT] + + def crawler_sections(self) -> List[Tuple[str, configparser.SectionProxy]]: + result = [] + for section_name, section_proxy in self._parser.items(): + if section_name.startswith("crawler:"): + crawler_name = section_name[8:] + result.append((crawler_name, section_proxy)) + + return result + + @property 
+ def working_dir(self) -> Path: + pathstr = self.default_section.get("working_dir", ".") + return Path(pathstr).expanduser() diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 9ceca20..6b1b350 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -8,6 +8,7 @@ from typing import AsyncContextManager, AsyncIterator, Optional from rich.markup import escape from .conductor import ProgressBar, TerminalConductor +from .config import Config from .limiter import Limiter from .transformer import RuleParseException, Transformer @@ -17,7 +18,12 @@ class CrawlerLoadException(Exception): class Crawler(ABC): - def __init__(self, name: str, section: configparser.SectionProxy) -> None: + def __init__( + self, + name: str, + config: Config, + section: configparser.SectionProxy, + ) -> None: """ Initialize a crawler from its name and its section in the config file. diff --git a/PFERD/pferd.py b/PFERD/pferd.py index d145ade..131ddc1 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -1,12 +1,52 @@ +from typing import Dict + +from rich import print +from rich.markup import escape + from .config import Config +from .crawler import Crawler from .crawlers import CRAWLERS +class PferdLoadException(Exception): + pass + + class Pferd: def __init__(self, config: Config): self._config = config + self._crawlers: Dict[str, Crawler] = {} + + def _load_crawlers(self) -> None: + abort = False + for name, section in self._config.crawler_sections(): + print(f"[bold bright_cyan]Loading[/] crawler:{escape(name)}") + crawler_type = section.get("type") + crawler_constructor = CRAWLERS.get(crawler_type) + if crawler_constructor is None: + abort = True + if crawler_type is None: + print("[red]Error: No type") + else: + t = escape(repr(crawler_type)) + print(f"[red]Error: Unknown type {t}") + continue + + crawler = crawler_constructor(name, self._config, section) + self._crawlers[name] = crawler + + if abort: + raise PferdLoadException() async def run(self) -> None: - print("Bleep bloop 1") - await 
CRAWLERS["dummy"]("dummy", self._config._parser["dummy"]).run() - print("Bleep bloop 2") + try: + self._load_crawlers() + except PferdLoadException: + print("[bold red]Could not initialize PFERD properly") + exit(1) + + for name, crawler in self._crawlers.items(): + print() + print(f"[bold bright_cyan]Running[/] crawler:{escape(name)}") + + await crawler.run() From 07e831218e975ff82637f5016d40ab0112882652 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 2 May 2021 00:56:10 +0200 Subject: [PATCH 084/524] Add sync report --- PFERD/report.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 PFERD/report.py diff --git a/PFERD/report.py b/PFERD/report.py new file mode 100644 index 0000000..382b82d --- /dev/null +++ b/PFERD/report.py @@ -0,0 +1,80 @@ +from dataclasses import dataclass +from pathlib import PurePath +from typing import Set + + +@dataclass +class MarkDuplicateException(Exception): + """ + Tried to mark a file that was already marked. + """ + + path: PurePath + + +@dataclass +class MarkConflictException(Exception): + """ + Marking the path would have caused a conflict. + + A conflict can have two reasons: Either the new file has the same path as + the parent directory of a known file, or a parent directory of the new file + has the same path as a known file. In either case, adding the new file + would require a file and a directory to share the same path, which is + usually not possible. + """ + + path: PurePath + collides_with: PurePath + + +class Report: + """ + A report of a synchronization. Includes all files found by the crawler, as + well as the set of changes made to local files. + """ + + def __init__(self): + self.known_files: Set[PurePath] = set() + + self.new_files: Set[PurePath] = set() + self.changed_files: Set[PurePath] = set() + self.deleted_files: Set[PurePath] = set() + + def mark(self, path: PurePath): + """ + Mark a previously unknown file as known. 
+ + May throw a MarkDuplicateException or a MarkConflictException. For more + detail, see the respective exception's docstring. + """ + + for known_path in self.known_files: + if path == known_path: + raise MarkDuplicateException(path) + + if path.relative_to(known_path) or known_path.relative_to(path): + raise MarkConflictException(path, known_path) + + self.known_files.add(path) + + def add_file(self, path: PurePath): + """ + Unlike mark(), this function accepts any paths. + """ + + self.new_files.add(path) + + def change_file(self, path: PurePath): + """ + Unlike mark(), this function accepts any paths. + """ + + self.changed_files.add(path) + + def delete_file(self, path: PurePath): + """ + Unlike mark(), this function accepts any paths. + """ + + self.deleted_files.add(path) From fde811ae5aee4a87c7d2891c23e12c2fa554676f Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 5 May 2021 00:55:55 +0200 Subject: [PATCH 085/524] Document on_conflict option --- CONFIG.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index a004dc3..65daae9 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -50,6 +50,18 @@ crawlers: already present locally. - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary downloads via some (unreliable) heuristics. +- `on_conflict`: What to do when the local and remote versions of a file or + directory differ. Includes the cases where a file is replaced by a directory + or a directory by a file. (Default: `prompt`) + - `prompt`: Always ask the user before overwriting or deleting local files + and directories. + - `local-first`: Always keep the local file or directory. Equivalent to + using `prompt` and always choosing "no". Implies that `redownload` is set + to `never`. + - `remote-first`: Always keep the remote file or directory. Equivalent to + using `prompt` and always choosing "yes". 
+ - `no-delete`: Never delete local files, but overwrite local files if the + remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) @@ -133,7 +145,7 @@ example `{g2.lower()}` or `{g3.replace(' ', '_')}`. ### Example: Tutorials -You have ILIAS course with lots of tutorials, but are only interested in a +You have an ILIAS course with lots of tutorials, but are only interested in a single one? ``` @@ -174,7 +186,7 @@ Lectures/ ... ``` -To do this, you can use the most powerful of arrows, the regex arrow. +To do this, you can use the most powerful of arrows: The regex arrow. ``` "Lectures/Lecture (\\d+): (.*)\\.PDF" -re-> "Lectures/{i1:02}_{g2.lower().replace(' ', '_')}.pdf" From bbfdadc4633997b5437ba6d4f98db41fb69e2390 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 5 May 2021 18:08:34 +0200 Subject: [PATCH 086/524] Implement output directory --- PFERD/crawler.py | 8 +- PFERD/output_dir.py | 365 ++++++++++++++++++++++++++++++++++++++++++++ PFERD/pferd.py | 7 +- PFERD/report.py | 13 +- 4 files changed, 381 insertions(+), 12 deletions(-) create mode 100644 PFERD/output_dir.py diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 6b1b350..4ee4fad 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -10,6 +10,7 @@ from rich.markup import escape from .conductor import ProgressBar, TerminalConductor from .config import Config from .limiter import Limiter +from .output_dir import OnConflict, OutputDirectory, Redownload from .transformer import RuleParseException, Transformer @@ -44,8 +45,11 @@ class Crawler(ABC): e.pretty_print() raise CrawlerLoadException() - # working_dir = Path(section.get("working_dir", "")) - # output_dir = working_dir / section.get("output_dir", name) + output_dir = config.working_dir / section.get("output_dir", name) + redownload = Redownload.NEVER_SMART + on_conflict = OnConflict.PROMPT + self._output_dir = 
OutputDirectory( + output_dir, redownload, on_conflict, self._conductor) def print(self, text: str) -> None: """ diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py new file mode 100644 index 0000000..9276069 --- /dev/null +++ b/PFERD/output_dir.py @@ -0,0 +1,365 @@ +import filecmp +import os +import random +import shutil +import string +from contextlib import asynccontextmanager +from dataclasses import dataclass +from datetime import datetime +from enum import Enum +from pathlib import Path, PurePath +# TODO In Python 3.9 and above, AsyncContextManager is deprecated +from typing import AsyncContextManager, AsyncIterator, BinaryIO, Optional + +from .conductor import TerminalConductor +from .report import MarkConflictException, MarkDuplicateException, Report +from .utils import prompt_yes_no + +SUFFIX_CHARS = string.ascii_lowercase + string.digits +SUFFIX_LENGTH = 6 +TRIES = 5 + + +class OutputDirException(Exception): + pass + + +class Redownload(Enum): + NEVER = "never" + NEVER_SMART = "never-smart" + ALWAYS = "always" + ALWAYS_SMART = "always-smart" + + +class OnConflict(Enum): + PROMPT = "prompt" + LOCAL_FIRST = "local-first" + REMOTE_FIRST = "remote-first" + NO_DELETE = "no-delete" + + +@dataclass +class Heuristics: + mtime: Optional[datetime] + + +class FileSink: + def __init__(self, file: BinaryIO): + self._file = file + self._done = False + + @property + def file(self) -> BinaryIO: + return self._file + + def done(self) -> None: + self._done = True + + def is_done(self) -> bool: + return self._done + + +@dataclass +class DownloadInfo: + path: PurePath + local_path: Path + tmp_path: Path + heuristics: Heuristics + on_conflict: OnConflict + success: bool = False + + +class OutputDirectory: + def __init__( + self, + root: Path, + redownload: Redownload, + on_conflict: OnConflict, + conductor: TerminalConductor, + ): + self._root = root + self._redownload = redownload + self._on_conflict = on_conflict + self._conductor = conductor + + self._report = Report() 
+ + def _mark(self, path: PurePath) -> None: + """ + May throw an OutputDirException + """ + + try: + self._report.mark(path) + except MarkDuplicateException: + msg = "Another file has already been placed here." + raise OutputDirException(msg) + except MarkConflictException as e: + msg = f"Collides with other file: {e.collides_with}" + raise OutputDirException(msg) + + def _resolve(self, path: PurePath) -> Path: + """ + May throw an OutputDirException. + """ + + if ".." in path.parts: + msg = f"Path {path} contains forbidden '..'" + raise OutputDirException(msg) + return self._root / path + + def _should_download( + self, + local_path: Path, + heuristics: Heuristics, + redownload: Redownload, + ) -> bool: + # If we don't have a *file* at the local path, we'll always redownload + # since we know that the remote is different from the local files. This + # includes the case where no local file exists. + if not local_path.is_file(): + return True + + if redownload == Redownload.NEVER: + return False + elif redownload == Redownload.ALWAYS: + return True + + stat = local_path.stat() + + remote_newer = None + if mtime := heuristics.mtime: + remote_newer = mtime.timestamp() > stat.st_mtime + + if redownload == Redownload.NEVER_SMART: + if remote_newer is None: + return False + else: + return remote_newer + elif redownload == Redownload.ALWAYS_SMART: + if remote_newer is None: + return True + else: + return not remote_newer + + # This should never be reached + raise ValueError(f"{redownload!r} is not a valid redownload policy") + + # The following conflict resolution functions all return False if the local + # file(s) should be kept and True if they should be replaced by the remote + # files. + + async def _conflict_lfrf( + self, + on_conflict: OnConflict, + path: PurePath, + ) -> bool: + if on_conflict == OnConflict.PROMPT: + async with self._conductor.exclusive_output(): + prompt = f"Replace {path} with remote file?" 
+ return await prompt_yes_no(prompt, default=False) + elif on_conflict == OnConflict.LOCAL_FIRST: + return False + elif on_conflict == OnConflict.REMOTE_FIRST: + return True + elif on_conflict == OnConflict.NO_DELETE: + return True + + # This should never be reached + raise ValueError(f"{on_conflict!r} is not a valid conflict policy") + + async def _conflict_ldrf( + self, + on_conflict: OnConflict, + path: PurePath, + ) -> bool: + if on_conflict == OnConflict.PROMPT: + async with self._conductor.exclusive_output(): + prompt = f"Recursively delete {path} and replace with remote file?" + return await prompt_yes_no(prompt, default=False) + elif on_conflict == OnConflict.LOCAL_FIRST: + return False + elif on_conflict == OnConflict.REMOTE_FIRST: + return True + elif on_conflict == OnConflict.NO_DELETE: + return False + + # This should never be reached + raise ValueError(f"{on_conflict!r} is not a valid conflict policy") + + async def _conflict_lfrd( + self, + on_conflict: OnConflict, + path: PurePath, + parent: PurePath, + ) -> bool: + if on_conflict == OnConflict.PROMPT: + async with self._conductor.exclusive_output(): + prompt = f"Delete {parent} so remote file {path} can be downloaded?" + return await prompt_yes_no(prompt, default=False) + elif on_conflict == OnConflict.LOCAL_FIRST: + return False + elif on_conflict == OnConflict.REMOTE_FIRST: + return True + elif on_conflict == OnConflict.NO_DELETE: + return False + + # This should never be reached + raise ValueError(f"{on_conflict!r} is not a valid conflict policy") + + async def _conflict_delete_lf( + self, + on_conflict: OnConflict, + path: PurePath, + ) -> bool: + if on_conflict == OnConflict.PROMPT: + async with self._conductor.exclusive_output(): + prompt = f"Delete {path}?" 
+ return await prompt_yes_no(prompt, default=False) + elif on_conflict == OnConflict.LOCAL_FIRST: + return False + elif on_conflict == OnConflict.REMOTE_FIRST: + return True + elif on_conflict == OnConflict.NO_DELETE: + return False + + # This should never be reached + raise ValueError(f"{on_conflict!r} is not a valid conflict policy") + + def _tmp_path(self, base: Path, suffix_length: int) -> Path: + prefix = "" if base.name.startswith(".") else "." + suffix = random.choices(SUFFIX_CHARS, k=suffix_length) + name = f"{prefix}{base.name}.tmp.{suffix}" + return base.parent / name + + @asynccontextmanager + async def _sink_context_manager( + self, + file: BinaryIO, + info: DownloadInfo, + ) -> AsyncIterator[FileSink]: + sink = FileSink(file) + try: + with file: + yield sink + finally: + info.success = sink.is_done() + await self._after_download(info) + + async def download( + self, + path: PurePath, + mtime: Optional[datetime] = None, + redownload: Optional[Redownload] = None, + on_conflict: Optional[OnConflict] = None, + ) -> Optional[AsyncContextManager[FileSink]]: + """ + May throw an OutputDirException. 
+ """ + + heuristics = Heuristics(mtime) + redownload = self._redownload if redownload is None else redownload + on_conflict = self._on_conflict if on_conflict is None else on_conflict + local_path = self._resolve(path) + + self._mark(path) + + if not self._should_download(local_path, heuristics, redownload): + return None + + # Detect and solve local-dir-remote-file conflict + if local_path.is_dir(): + if await self._conflict_ldrf(on_conflict, path): + shutil.rmtree(local_path) + else: + return None + + # Detect and solve local-file-remote-dir conflict + for parent in path.parents: + local_parent = self._resolve(parent) + if local_parent.exists() and not local_parent.is_dir(): + if await self._conflict_lfrd(on_conflict, path, parent): + local_parent.unlink() + break + else: + return None + + # Ensure parent directory exists + local_path.parent.mkdir(parents=True, exist_ok=True) + + # Create tmp file + for attempt in range(TRIES): + suffix_length = SUFFIX_LENGTH + 2 * attempt + tmp_path = self._tmp_path(local_path, suffix_length) + info = DownloadInfo(path, local_path, tmp_path, + heuristics, on_conflict) + try: + file = open(tmp_path, "bx") + return self._sink_context_manager(file, info) + except FileExistsError: + pass # Try again + + return None + + async def _after_download(self, info: DownloadInfo) -> None: + changed = False + + if not info.success: + info.tmp_path.unlink() + return + + # Solve conflicts arising from existing local file + if info.local_path.exists(): + changed = True + if filecmp.cmp(info.local_path, info.tmp_path): + info.tmp_path.unlink() + return + + if not await self._conflict_lfrf(info.on_conflict, info.path): + info.tmp_path.unlink() + return + + # Modify metadata if necessary + if mtime := info.heuristics.mtime: + # TODO Pick an implementation + # Rounding up to avoid inaccuracies in how the OS stores timestamps + # mtimestamp = math.ceil(mtime.timestamp()) + mtimestamp = mtime.timestamp() + os.utime(info.tmp_path, times=(mtimestamp, 
mtimestamp)) + + info.tmp_path.replace(info.local_path) + + if changed: + self._report.change_file(info.path) + else: + self._report.add_file(info.path) + + def cleanup(self) -> None: + self._cleanup_dir(self._root, PurePath()) + + def _cleanup(self, path: Path, pure: PurePath) -> None: + if path.is_dir(): + self._cleanup_dir(path, pure) + elif path.is_file(): + self._cleanup_file(path, pure) + + def _cleanup_dir(self, path: Path, pure: PurePath) -> None: + for child in path.iterdir(): + pure_child = pure / child.name + self._cleanup(child, pure_child) + + try: + path.rmdir() + except OSError: + pass + + def _cleanup_file(self, path: Path, pure: PurePath) -> None: + if self._report.marked(pure): + return + + if self._conflict_delete_lf(self._on_conflict, pure): + try: + path.unlink() + self._report.delete_file(pure) + except OSError: + pass diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 131ddc1..54356c1 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -25,11 +25,8 @@ class Pferd: crawler_constructor = CRAWLERS.get(crawler_type) if crawler_constructor is None: abort = True - if crawler_type is None: - print("[red]Error: No type") - else: - t = escape(repr(crawler_type)) - print(f"[red]Error: Unknown type {t}") + t = escape(repr(crawler_type)) + print(f"[red]Error: Unknown type {t}") continue crawler = crawler_constructor(name, self._config, section) diff --git a/PFERD/report.py b/PFERD/report.py index 382b82d..38e8130 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -34,14 +34,14 @@ class Report: well as the set of changes made to local files. """ - def __init__(self): + def __init__(self) -> None: self.known_files: Set[PurePath] = set() self.new_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set() - def mark(self, path: PurePath): + def mark(self, path: PurePath) -> None: """ Mark a previously unknown file as known. 
@@ -58,21 +58,24 @@ class Report: self.known_files.add(path) - def add_file(self, path: PurePath): + def marked(self, path: PurePath) -> bool: + return path in self.known_files + + def add_file(self, path: PurePath) -> None: """ Unlike mark(), this function accepts any paths. """ self.new_files.add(path) - def change_file(self, path: PurePath): + def change_file(self, path: PurePath) -> None: """ Unlike mark(), this function accepts any paths. """ self.changed_files.add(path) - def delete_file(self, path: PurePath): + def delete_file(self, path: PurePath) -> None: """ Unlike mark(), this function accepts any paths. """ From 5497dd28275764d7af5dbac6c94452f1b71c8bab Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 5 May 2021 23:36:54 +0200 Subject: [PATCH 087/524] Add @noncritical and @repeat decorators --- PFERD/crawler.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 4ee4fad..4cb48a9 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -3,7 +3,8 @@ from abc import ABC, abstractmethod from contextlib import asynccontextmanager from pathlib import PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import AsyncContextManager, AsyncIterator, Optional +from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, + Callable, Optional, Protocol, TypeVar) from rich.markup import escape @@ -18,6 +19,78 @@ class CrawlerLoadException(Exception): pass +class CrawlerMemberFunction(Protocol): + def __call__( + self, + __self: "Crawler", + *__args: Any, + **__kwargs: Any, + ) -> None: + pass + + +Wrapped = TypeVar("Wrapped", bound=CrawlerMemberFunction) + + +def noncritical(f: Wrapped) -> Wrapped: + def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: + try: + f(self, *args, **kwargs) + except Exception as e: + self.print(f"[red]Something went wrong: {escape(str(e))}") + self._error_free = False + return 
wrapper # type: ignore + + +def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]: + def decorator(f: Wrapped) -> Wrapped: + def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: + for _ in range(attempts - 1): + try: + f(self, *args, **kwargs) + return + except Exception: + pass + f(self, *args, **kwargs) + return wrapper # type: ignore + return decorator + + +class ACrawlerMemberFunction(Protocol): + def __call__( + self, + __self: "Crawler", + *__args: Any, + **__kwargs: Any, + ) -> Awaitable[None]: + pass + + +AWrapped = TypeVar("AWrapped", bound=ACrawlerMemberFunction) + + +def anoncritical(f: AWrapped) -> AWrapped: + async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: + try: + await f(self, *args, **kwargs) + except Exception as e: + self.print(f"[red]Something went wrong: {escape(str(e))}") + self._error_free = False + return wrapper # type: ignore + + +def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: + def decorator(f: AWrapped) -> AWrapped: + async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: + for _ in range(attempts - 1): + try: + await f(self, *args, **kwargs) + return + except Exception: + pass + await f(self, *args, **kwargs) + return wrapper # type: ignore + return decorator class Crawler(ABC): def __init__( self, @@ -51,6 +124,8 @@ class Crawler(ABC): self._output_dir = OutputDirectory( output_dir, redownload, on_conflict, self._conductor) + self._error_free = False + def print(self, text: str) -> None: """ Print rich markup to the terminal. 
Crawlers *must* use this function to From 273d56c39a8440aca743188ddb56e7c50a4f109d Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 5 May 2021 23:45:10 +0200 Subject: [PATCH 088/524] Properly load crawler config --- PFERD/config.py | 38 ++++++++++++++++++++++------ PFERD/crawler.py | 52 +++++++++++++++++++++++++++++++------- PFERD/crawlers/__init__.py | 9 +++++-- 3 files changed, 80 insertions(+), 19 deletions(-) diff --git a/PFERD/config.py b/PFERD/config.py index d02900d..f2abe8d 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -1,7 +1,8 @@ -import configparser import os +from configparser import ConfigParser, SectionProxy +from dataclasses import dataclass from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, List, NoReturn, Optional, Tuple from .utils import prompt_yes_no @@ -14,6 +15,27 @@ class ConfigDumpException(Exception): pass +@dataclass +class ConfigFormatException(Exception): + section: str + key: str + desc: str + + +class Section: + def __init__(self, section: SectionProxy): + self.s = section + + def error(self, key: str, desc: str) -> NoReturn: + raise ConfigFormatException(self.s.name, key, desc) + + def invalid_value(self, key: str, value: Any) -> NoReturn: + self.error(key, f"Invalid value: {value!r}") + + def missing_value(self, key: str) -> NoReturn: + self.error(key, "Missing value") + + class Config: @staticmethod def _default_path() -> Path: @@ -24,7 +46,7 @@ class Config: else: return Path("~/.pferd.cfg").expanduser() - def __init__(self, parser: configparser.ConfigParser): + def __init__(self, parser: ConfigParser): self._parser = parser @staticmethod @@ -34,7 +56,7 @@ class Config: raise ConfigLoadException() @staticmethod - def load_parser(path: Optional[Path] = None) -> configparser.ConfigParser: + def load_parser(path: Optional[Path] = None) -> ConfigParser: """ May throw a ConfigLoadException. 
""" @@ -42,7 +64,7 @@ class Config: if not path: path = Config._default_path() - parser = configparser.ConfigParser() + parser = ConfigParser() # Using config.read_file instead of config.read because config.read # would just ignore a missing file and carry on. @@ -100,10 +122,10 @@ class Config: self._fail_dump(path, "Insufficient permissions") @property - def default_section(self) -> configparser.SectionProxy: - return self._parser[configparser.DEFAULTSECT] + def default_section(self) -> SectionProxy: + return self._parser[self._parser.default_section] - def crawler_sections(self) -> List[Tuple[str, configparser.SectionProxy]]: + def crawler_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] for section_name, section_proxy in self._parser.items(): if section_name.startswith("crawler:"): diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 4cb48a9..ff779ab 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -1,7 +1,6 @@ -import configparser from abc import ABC, abstractmethod from contextlib import asynccontextmanager -from pathlib import PurePath +from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, Callable, Optional, Protocol, TypeVar) @@ -9,7 +8,7 @@ from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, from rich.markup import escape from .conductor import ProgressBar, TerminalConductor -from .config import Config +from .config import Config, Section from .limiter import Limiter from .output_dir import OnConflict, OutputDirectory, Redownload from .transformer import RuleParseException, Transformer @@ -91,12 +90,46 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: await f(self, *args, **kwargs) return wrapper # type: ignore return decorator + + +class CrawlerSection(Section): + def output_dir(self, name: str) -> Path: + return Path(self.s.get("output_dir", name)) + + def redownload(self) -> 
Redownload: + value = self.s.get("redownload", "never-smart") + if value == "never": + return Redownload.NEVER + elif value == "never-smart": + return Redownload.NEVER_SMART + elif value == "always": + return Redownload.ALWAYS + elif value == "always-smart": + return Redownload.ALWAYS_SMART + self.invalid_value("redownload", value) + + def on_conflict(self) -> OnConflict: + value = self.s.get("on_conflict", "prompt") + if value == "prompt": + return OnConflict.PROMPT + elif value == "local-first": + return OnConflict.LOCAL_FIRST + elif value == "remote-first": + return OnConflict.REMOTE_FIRST + elif value == "no-delete": + return OnConflict.NO_DELETE + self.invalid_value("on_conflict", value) + + def transform(self) -> str: + return self.s.get("transform", "") + + class Crawler(ABC): def __init__( self, name: str, config: Config, - section: configparser.SectionProxy, + section: CrawlerSection, ) -> None: """ Initialize a crawler from its name and its section in the config file. @@ -113,16 +146,17 @@ class Crawler(ABC): self._limiter = Limiter() try: - self._transformer = Transformer(section.get("transform", "")) + self._transformer = Transformer(section.transform()) except RuleParseException as e: e.pretty_print() raise CrawlerLoadException() - output_dir = config.working_dir / section.get("output_dir", name) - redownload = Redownload.NEVER_SMART - on_conflict = OnConflict.PROMPT self._output_dir = OutputDirectory( - output_dir, redownload, on_conflict, self._conductor) + config.working_dir / section.output_dir(name), + section.redownload(), + section.on_conflict(), + self._conductor, + ) self._error_free = False diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index 5248a2d..69dac39 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -1,5 +1,10 @@ +from configparser import SectionProxy +from typing import Callable, Dict + +from ..config import Config +from ..crawler import Crawler, CrawlerSection from .dummy import 
DummyCrawler -CRAWLERS = { - "dummy": DummyCrawler, +CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = { + "dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)), } From 60cd9873bcb9f116827eff6b7bc1c444fb0b786d Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 6 May 2021 01:02:40 +0200 Subject: [PATCH 089/524] Add local file crawler --- PFERD/conductor.py | 3 ++ PFERD/crawler.py | 34 ++++++++++++++------ PFERD/crawlers/__init__.py | 2 ++ PFERD/crawlers/local.py | 63 ++++++++++++++++++++++++++++++++++++++ PFERD/output_dir.py | 2 +- PFERD/pferd.py | 2 +- PFERD/report.py | 11 ++++++- 7 files changed, 104 insertions(+), 13 deletions(-) create mode 100644 PFERD/crawlers/local.py diff --git a/PFERD/conductor.py b/PFERD/conductor.py index 161a287..76d0e2a 100644 --- a/PFERD/conductor.py +++ b/PFERD/conductor.py @@ -14,6 +14,9 @@ class ProgressBar: def advance(self, amount: float = 1) -> None: self._progress.advance(self._taskid, advance=amount) + def set_total(self, total) -> None: + self._progress.update(self._taskid, total=total) + class TerminalConductor: def __init__(self) -> None: diff --git a/PFERD/crawler.py b/PFERD/crawler.py index ff779ab..d088b21 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -1,16 +1,17 @@ from abc import ABC, abstractmethod from contextlib import asynccontextmanager +from datetime import datetime from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, - Callable, Optional, Protocol, TypeVar) +from typing import (Any, AsyncContextManager, AsyncIterator, Callable, + Coroutine, Optional, Protocol, TypeVar) from rich.markup import escape from .conductor import ProgressBar, TerminalConductor from .config import Config, Section from .limiter import Limiter -from .output_dir import OnConflict, OutputDirectory, Redownload +from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload from 
.transformer import RuleParseException, Transformer @@ -37,7 +38,7 @@ def noncritical(f: Wrapped) -> Wrapped: f(self, *args, **kwargs) except Exception as e: self.print(f"[red]Something went wrong: {escape(str(e))}") - self._error_free = False + self.error_free = False return wrapper # type: ignore @@ -61,7 +62,7 @@ class ACrawlerMemberFunction(Protocol): __self: "Crawler", *__args: Any, **__kwargs: Any, - ) -> Awaitable[None]: + ) -> Coroutine[Any, Any, None]: pass @@ -74,7 +75,7 @@ def anoncritical(f: AWrapped) -> AWrapped: await f(self, *args, **kwargs) except Exception as e: self.print(f"[red]Something went wrong: {escape(str(e))}") - self._error_free = False + self.error_free = False return wrapper # type: ignore @@ -94,7 +95,7 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: class CrawlerSection(Section): def output_dir(self, name: str) -> Path: - return Path(self.s.get("output_dir", name)) + return Path(self.s.get("output_dir", name)).expanduser() def redownload(self) -> Redownload: value = self.s.get("redownload", "never-smart") @@ -158,7 +159,7 @@ class Crawler(ABC): self._conductor, ) - self._error_free = False + self.error_free = False def print(self, text: str) -> None: """ @@ -203,11 +204,24 @@ class Crawler(ABC): def download_bar( self, path: PurePath, - size: int, + total: Optional[int] = None, ) -> AsyncContextManager[ProgressBar]: pathstr = escape(str(path)) desc = f"[bold green]Downloading[/bold green] {pathstr}" - return self.progress_bar(desc, total=size) + return self.progress_bar(desc, total=total) + + async def download( + self, + path: PurePath, + mtime: Optional[datetime] = None, + redownload: Optional[Redownload] = None, + on_conflict: Optional[OnConflict] = None, + ) -> Optional[AsyncContextManager[FileSink]]: + return await self._output_dir.download( + path, mtime, redownload, on_conflict) + + async def cleanup(self) -> None: + await self._output_dir.cleanup() async def run(self) -> None: """ diff --git 
a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index 69dac39..15ef403 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -4,7 +4,9 @@ from typing import Callable, Dict from ..config import Config from ..crawler import Crawler, CrawlerSection from .dummy import DummyCrawler +from .local import LocalCrawler, LocalCrawlerSection CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = { "dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)), + "local": lambda n, c, s: LocalCrawler(n, c, LocalCrawlerSection(s)), } diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py new file mode 100644 index 0000000..77ebf81 --- /dev/null +++ b/PFERD/crawlers/local.py @@ -0,0 +1,63 @@ +import asyncio +from pathlib import Path, PurePath + +from ..config import Config +from ..crawler import Crawler, CrawlerSection, anoncritical + + +class LocalCrawlerSection(CrawlerSection): + def path(self) -> Path: + value = self.s.get("path") + if value is None: + self.missing_value("path") + return Path(value).expanduser() + + +class LocalCrawler(Crawler): + def __init__( + self, + name: str, + config: Config, + section: LocalCrawlerSection, + ): + super().__init__(name, config, section) + + self._path = section.path() + + async def crawl(self) -> None: + await self._crawl_path(self._path, PurePath()) + if self.error_free: + self.cleanup() + + @anoncritical + async def _crawl_path(self, path: Path, pure: PurePath) -> None: + if path.is_dir(): + await self._crawl_dir(path, pure) + elif path.is_file(): + await self._crawl_file(path, pure) + + async def _crawl_dir(self, path: Path, pure: PurePath) -> None: + tasks = [] + async with self.crawl_bar(pure): + for child in path.iterdir(): + pure_child = pure / child.name + tasks.append(self._crawl_path(child, pure_child)) + await asyncio.gather(*tasks) + + async def _crawl_file(self, path: Path, pure: PurePath) -> None: + async with self.download_bar(path) as bar: + 
bar.set_total(path.stat().st_size) + + dl = await self.download(pure) + if not dl: + return + + async with dl as sink: + with open(path, "rb") as f: + while True: + data = f.read(1024**2) + if len(data) == 0: + break + sink.file.write(data) + bar.advance(len(data)) + sink.done() diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 9276069..c875574 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -294,7 +294,7 @@ class OutputDirectory: info = DownloadInfo(path, local_path, tmp_path, heuristics, on_conflict) try: - file = open(tmp_path, "bx") + file = open(tmp_path, "xb") return self._sink_context_manager(file, info) except FileExistsError: pass # Try again diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 54356c1..7cdbfa0 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -26,7 +26,7 @@ class Pferd: if crawler_constructor is None: abort = True t = escape(repr(crawler_type)) - print(f"[red]Error: Unknown type {t}") + print(f"[red]Error: Unknown crawler type {t}") continue crawler = crawler_constructor(name, self._config, section) diff --git a/PFERD/report.py b/PFERD/report.py index 38e8130..b98c90c 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -28,6 +28,15 @@ class MarkConflictException(Exception): collides_with: PurePath +# TODO Use PurePath.is_relative_to when updating to 3.9 +def is_relative_to(a: PurePath, b: PurePath) -> bool: + try: + a.relative_to(b) + return True + except ValueError: + return False + + class Report: """ A report of a synchronization. 
Includes all files found by the crawler, as @@ -53,7 +62,7 @@ class Report: if path == known_path: raise MarkDuplicateException(path) - if path.relative_to(known_path) or known_path.relative_to(path): + if is_relative_to(path, known_path) or is_relative_to(known_path, path): raise MarkConflictException(path, known_path) self.known_files.add(path) From f9b2fd60e2d43d300097704a4933721cbc2c2115 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 9 May 2021 01:33:47 +0200 Subject: [PATCH 090/524] Document local crawler and auth --- CONFIG.md | 24 +++++++++++++++++++++++- PFERD/crawlers/local.py | 2 +- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 65daae9..16c8531 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -65,6 +65,23 @@ crawlers: - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) +Some crawlers may also require credentials for authentication. To configure how +the crawler obtains its credentials, the `auth` option is used. It is set to the +full name of an auth section (including the `auth:` prefix). + +Here is a simple example: + +``` +[auth:example] +type = simple +username = foo +password = bar + +[crawl:something] +type = some-complex-crawler +auth = auth:example +``` + ## The `auth:*` sections Sections whose names start with `auth:` are used to configure authenticators. An @@ -82,7 +99,12 @@ authenticators is `type`: ## Crawler types -TODO Fill in as crawlers are implemented +### The `local` crawler + +This crawler crawls a local directory. It is really simple and mostly useful for +testing different setups. + +- `path`: Path to the local directory to crawl. 
(No default, must be specified) ## Authenticator types diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 77ebf81..40cc233 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -22,7 +22,7 @@ class LocalCrawler(Crawler): ): super().__init__(name, config, section) - self._path = section.path() + self._path = config.working_dir / section.path() async def crawl(self) -> None: await self._crawl_path(self._path, PurePath()) From cec0a8e1fc2611583c2ee11260686c9a67587561 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 9 May 2021 01:45:01 +0200 Subject: [PATCH 091/524] Fix mymy errors --- PFERD/crawler.py | 62 +++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index d088b21..b8e9d7c 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -3,8 +3,8 @@ from contextlib import asynccontextmanager from datetime import datetime from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import (Any, AsyncContextManager, AsyncIterator, Callable, - Coroutine, Optional, Protocol, TypeVar) +from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, + Callable, Optional, Protocol, TypeVar) from rich.markup import escape @@ -19,20 +19,17 @@ class CrawlerLoadException(Exception): pass -class CrawlerMemberFunction(Protocol): - def __call__( - self, - __self: "Crawler", - *__args: Any, - **__kwargs: Any, - ) -> None: - pass - - -Wrapped = TypeVar("Wrapped", bound=CrawlerMemberFunction) +Wrapped = TypeVar("Wrapped", bound=Callable[..., None]) def noncritical(f: Wrapped) -> Wrapped: + """ + Warning: Must only be applied to member functions of the Crawler class! + + Catches all exceptions occuring during the function call. If an exception + occurs, the crawler's error_free variable is set to False. 
+ """ + def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: try: f(self, *args, **kwargs) @@ -43,6 +40,14 @@ def noncritical(f: Wrapped) -> Wrapped: def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]: + """ + Warning: Must only be applied to member functions of the Crawler class! + + If an exception occurs during the function call, retries the function call + a set amount of times. Exceptions that occur during the last attempt are + not caught and instead passed on upwards. + """ + def decorator(f: Wrapped) -> Wrapped: def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: for _ in range(attempts - 1): @@ -56,20 +61,18 @@ def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]: return decorator -class ACrawlerMemberFunction(Protocol): - def __call__( - self, - __self: "Crawler", - *__args: Any, - **__kwargs: Any, - ) -> Coroutine[Any, Any, None]: - pass - - -AWrapped = TypeVar("AWrapped", bound=ACrawlerMemberFunction) +AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) def anoncritical(f: AWrapped) -> AWrapped: + """ + An async version of @noncritical. + Warning: Must only be applied to member functions of the Crawler class! + + Catches all exceptions occuring during the function call. If an exception + occurs, the crawler's error_free variable is set to False. + """ + async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: try: await f(self, *args, **kwargs) @@ -80,6 +83,15 @@ def anoncritical(f: AWrapped) -> AWrapped: def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: + """ + An async version of @noncritical. + Warning: Must only be applied to member functions of the Crawler class! + + If an exception occurs during the function call, retries the function call + a set amount of times. Exceptions that occur during the last attempt are + not caught and instead passed on upwards. 
+ """ + def decorator(f: AWrapped) -> AWrapped: async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: for _ in range(attempts - 1): @@ -221,7 +233,7 @@ class Crawler(ABC): path, mtime, redownload, on_conflict) async def cleanup(self) -> None: - await self._output_dir.cleanup() + self._output_dir.cleanup() async def run(self) -> None: """ From 595ba8b7ab601c90b930f36c4c63a194deac8fb8 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 10 May 2021 23:47:46 +0200 Subject: [PATCH 092/524] Remove dummy crawler --- PFERD/crawlers/__init__.py | 2 -- PFERD/crawlers/dummy.py | 59 -------------------------------------- 2 files changed, 61 deletions(-) delete mode 100644 PFERD/crawlers/dummy.py diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index 15ef403..bf88a2a 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -3,10 +3,8 @@ from typing import Callable, Dict from ..config import Config from ..crawler import Crawler, CrawlerSection -from .dummy import DummyCrawler from .local import LocalCrawler, LocalCrawlerSection CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = { - "dummy": lambda n, c, s: DummyCrawler(n, c, CrawlerSection(s)), "local": lambda n, c, s: LocalCrawler(n, c, LocalCrawlerSection(s)), } diff --git a/PFERD/crawlers/dummy.py b/PFERD/crawlers/dummy.py deleted file mode 100644 index 204b4b1..0000000 --- a/PFERD/crawlers/dummy.py +++ /dev/null @@ -1,59 +0,0 @@ -import asyncio -import random -from pathlib import PurePath -from typing import Any - -from rich.markup import escape - -from ..crawler import Crawler -from ..utils import ainput - -DUMMY_TREE = { - "Blätter": { - "Blatt_01.pdf": (), - "Blatt_02.pdf": (), - "Blatt_03.pdf": (), - "Blatt_04.pdf": (), - "Blatt_05.pdf": (), - "Lösungen": { - "Blatt_01_Lösung.pdf": (), - "Blatt_02_Lösung.pdf": (), - "Blatt_03_Lösung.pdf": True, - "Blatt_04_Lösung.pdf": (), - "Blatt_05_Lösung.pdf": (), - }, - }, - "Vorlesungsfolien": { - "VL_01.pdf": 
(), - "VL_02.pdf": (), - "VL_03.pdf": (), - "VL_04.pdf": (), - "VL_05.pdf": (), - }, - "noch_mehr.txt": (), - "dateien.jar": (), -} - - -class DummyCrawler(Crawler): - async def crawl(self) -> None: - await self._crawl_entry(PurePath(), DUMMY_TREE) - - async def _crawl_entry(self, path: PurePath, value: Any) -> None: - if value is True: - async with self.exclusive_output(): - await ainput(f"File {path}, please press enter: ") - if value == () or value is True: - n = random.randint(5, 20) - async with self.download_bar(path, n) as bar: - await asyncio.sleep(random.random() / 2) - for i in range(n): - await asyncio.sleep(0.5) - bar.advance() - self.print(f"[green]Downloaded {escape(str(path))}") - else: - t = random.random() * 2 + 1 - async with self.crawl_bar(path) as bar: - await asyncio.sleep(t) - tasks = [self._crawl_entry(path / k, v) for k, v in value.items()] - await asyncio.gather(*tasks) From d5f29f01c59c5992e715eda49254b10c964771fc Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 10 May 2021 23:50:16 +0200 Subject: [PATCH 093/524] Use global conductor instance The switch from crawler-local conductors to a single pferd-global conductor was made to prepare for auth section credential providers. 
--- PFERD/conductor.py | 2 +- PFERD/config.py | 4 ++++ PFERD/crawler.py | 11 +++++------ PFERD/crawlers/__init__.py | 15 ++++++++++++--- PFERD/crawlers/local.py | 6 ++++-- PFERD/pferd.py | 9 ++++++++- 6 files changed, 34 insertions(+), 13 deletions(-) diff --git a/PFERD/conductor.py b/PFERD/conductor.py index 76d0e2a..4648e77 100644 --- a/PFERD/conductor.py +++ b/PFERD/conductor.py @@ -14,7 +14,7 @@ class ProgressBar: def advance(self, amount: float = 1) -> None: self._progress.advance(self._taskid, advance=amount) - def set_total(self, total) -> None: + def set_total(self, total: float) -> None: self._progress.update(self._taskid, total=total) diff --git a/PFERD/config.py b/PFERD/config.py index f2abe8d..f63922b 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -23,6 +23,10 @@ class ConfigFormatException(Exception): class Section: + """ + Base class for the crawler and auth section classes. + """ + def __init__(self, section: SectionProxy): self.s = section diff --git a/PFERD/crawler.py b/PFERD/crawler.py index b8e9d7c..4bcfe65 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -4,7 +4,7 @@ from datetime import datetime from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, - Callable, Optional, Protocol, TypeVar) + Callable, Optional, TypeVar) from rich.markup import escape @@ -141,8 +141,9 @@ class Crawler(ABC): def __init__( self, name: str, - config: Config, section: CrawlerSection, + config: Config, + conductor: TerminalConductor, ) -> None: """ Initialize a crawler from its name and its section in the config file. 
@@ -154,9 +155,9 @@ class Crawler(ABC): """ self.name = name - - self._conductor = TerminalConductor() + self._conductor = conductor self._limiter = Limiter() + self.error_free = True try: self._transformer = Transformer(section.transform()) @@ -171,8 +172,6 @@ class Crawler(ABC): self._conductor, ) - self.error_free = False - def print(self, text: str) -> None: """ Print rich markup to the terminal. Crawlers *must* use this function to diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index bf88a2a..aa049b9 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -1,10 +1,19 @@ from configparser import SectionProxy from typing import Callable, Dict +from ..conductor import TerminalConductor from ..config import Config -from ..crawler import Crawler, CrawlerSection +from ..crawler import Crawler from .local import LocalCrawler, LocalCrawlerSection -CRAWLERS: Dict[str, Callable[[str, Config, SectionProxy], Crawler]] = { - "local": lambda n, c, s: LocalCrawler(n, c, LocalCrawlerSection(s)), +CrawlerConstructor = Callable[[ + str, # Name (without the "crawl:" prefix) + SectionProxy, # Crawler's section of global config + Config, # Global config + TerminalConductor, # Global conductor instance +], Crawler] + +CRAWLERS: Dict[str, CrawlerConstructor] = { + "local": lambda n, s, c, t: + LocalCrawler(n, LocalCrawlerSection(s), c, t), } diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 40cc233..8501877 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -1,6 +1,7 @@ import asyncio from pathlib import Path, PurePath +from ..conductor import TerminalConductor from ..config import Config from ..crawler import Crawler, CrawlerSection, anoncritical @@ -17,10 +18,11 @@ class LocalCrawler(Crawler): def __init__( self, name: str, - config: Config, section: LocalCrawlerSection, + config: Config, + conductor: TerminalConductor, ): - super().__init__(name, config, section) + super().__init__(name, section, 
config, conductor) self._path = config.working_dir / section.path() diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 7cdbfa0..c7cd695 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -3,6 +3,7 @@ from typing import Dict from rich import print from rich.markup import escape +from .conductor import TerminalConductor from .config import Config from .crawler import Crawler from .crawlers import CRAWLERS @@ -15,6 +16,7 @@ class PferdLoadException(Exception): class Pferd: def __init__(self, config: Config): self._config = config + self._conductor = TerminalConductor() self._crawlers: Dict[str, Crawler] = {} def _load_crawlers(self) -> None: @@ -29,7 +31,12 @@ class Pferd: print(f"[red]Error: Unknown crawler type {t}") continue - crawler = crawler_constructor(name, self._config, section) + crawler = crawler_constructor( + name, + section, + self._config, + self._conductor, + ) self._crawlers[name] = crawler if abort: From 0459ed093eac4927bbc570fadbcdf949726713de Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 11 May 2021 00:27:43 +0200 Subject: [PATCH 094/524] Add simple authenticator ... 
including some required authenticator infrastructure --- PFERD/authenticator.py | 52 ++++++++++++++++++++++++++++++++ PFERD/authenticators/__init__.py | 19 ++++++++++++ PFERD/authenticators/simple.py | 48 +++++++++++++++++++++++++++++ PFERD/config.py | 9 ++++++ PFERD/pferd.py | 27 +++++++++++++++++ 5 files changed, 155 insertions(+) create mode 100644 PFERD/authenticator.py create mode 100644 PFERD/authenticators/__init__.py create mode 100644 PFERD/authenticators/simple.py diff --git a/PFERD/authenticator.py b/PFERD/authenticator.py new file mode 100644 index 0000000..42d8bb9 --- /dev/null +++ b/PFERD/authenticator.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod +from typing import Tuple + +from .conductor import TerminalConductor +from .config import Config, Section + + +class AuthLoadException(Exception): + pass + + +class AuthException(Exception): + pass + + +class AuthSection(Section): + pass + + +class Authenticator(ABC): + def __init__( + self, + name: str, + section: AuthSection, + config: Config, + conductor: TerminalConductor, + ) -> None: + """ + Initialize an authenticator from its name and its section in the config + file. + + If you are writing your own constructor for your own authenticator, + make sure to call this constructor first (via super().__init__). + + May throw an AuthLoadException. 
+ """ + + self.name = name + self.conductor = conductor + + @abstractmethod + async def credentials(self) -> Tuple[str, str]: + pass + + def invalid_credentials(self) -> None: + raise AuthException("Invalid credentials") + + def invalid_username(self) -> None: + raise AuthException("Invalid username") + + def invalid_password(self) -> None: + raise AuthException("Invalid password") diff --git a/PFERD/authenticators/__init__.py b/PFERD/authenticators/__init__.py new file mode 100644 index 0000000..d021d40 --- /dev/null +++ b/PFERD/authenticators/__init__.py @@ -0,0 +1,19 @@ +from configparser import SectionProxy +from typing import Callable, Dict + +from ..authenticator import Authenticator +from ..conductor import TerminalConductor +from ..config import Config +from .simple import SimpleAuthenticator, SimpleAuthSection + +AuthConstructor = Callable[[ + str, # Name (without the "auth:" prefix) + SectionProxy, # Authenticator's section of global config + Config, # Global config + TerminalConductor, # Global conductor instance +], Authenticator] + +AUTHENTICATORS: Dict[str, AuthConstructor] = { + "simple": lambda n, s, c, t: + SimpleAuthenticator(n, SimpleAuthSection(s), c, t), +} diff --git a/PFERD/authenticators/simple.py b/PFERD/authenticators/simple.py new file mode 100644 index 0000000..3a57faf --- /dev/null +++ b/PFERD/authenticators/simple.py @@ -0,0 +1,48 @@ +from typing import Optional, Tuple + +from ..authenticator import Authenticator, AuthSection +from ..conductor import TerminalConductor +from ..config import Config +from ..utils import agetpass, ainput + + +class SimpleAuthSection(AuthSection): + def username(self) -> Optional[str]: + return self.s.get("username") + + def password(self) -> Optional[str]: + return self.s.get("password") + + +class SimpleAuthenticator(Authenticator): + def __init__( + self, + name: str, + section: SimpleAuthSection, + config: Config, + conductor: TerminalConductor, + ) -> None: + super().__init__(name, section, config, 
conductor) + + self.username = section.username() + self.password = section.password() + + self.username_fixed = self.username is not None + self.password_fixed = self.password is not None + + async def credentials(self) -> Tuple[str, str]: + if self.username is not None and self.password is not None: + return self.username, self.password + + async with self.conductor.exclusive_output(): + if self.username is None: + self.username = await ainput("Username: ") + else: + print(f"Username: {self.username}") + + if self.password is None: + self.password = await agetpass("Password: ") + else: + print("Password: *******") + + return self.username, self.password diff --git a/PFERD/config.py b/PFERD/config.py index f63922b..56ea9af 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -138,6 +138,15 @@ class Config: return result + def authenticator_sections(self) -> List[Tuple[str, SectionProxy]]: + result = [] + for section_name, section_proxy in self._parser.items(): + if section_name.startswith("auth:"): + crawler_name = section_name[5:] + result.append((crawler_name, section_proxy)) + + return result + @property def working_dir(self) -> Path: pathstr = self.default_section.get("working_dir", ".") diff --git a/PFERD/pferd.py b/PFERD/pferd.py index c7cd695..fb411fb 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -3,6 +3,8 @@ from typing import Dict from rich import print from rich.markup import escape +from .authenticator import Authenticator +from .authenticators import AUTHENTICATORS from .conductor import TerminalConductor from .config import Config from .crawler import Crawler @@ -17,8 +19,32 @@ class Pferd: def __init__(self, config: Config): self._config = config self._conductor = TerminalConductor() + self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} + def _load_authenticators(self) -> None: + abort = False + for name, section in self._config.authenticator_sections(): + print(f"[bold bright_cyan]Loading[/] 
auth:{escape(name)}") + authenticator_type = section.get("type") + authenticator_constructor = AUTHENTICATORS.get(authenticator_type) + if authenticator_constructor is None: + abort = True + t = escape(repr(authenticator_type)) + print(f"[red]Error: Unknown authenticator type {t}") + continue + + authenticator = authenticator_constructor( + name, + section, + self._config, + self._conductor, + ) + self._authenticators[name] = authenticator + + if abort: + raise PferdLoadException() + def _load_crawlers(self) -> None: abort = False for name, section in self._config.crawler_sections(): @@ -44,6 +70,7 @@ class Pferd: async def run(self) -> None: try: + self._load_authenticators() self._load_crawlers() except PferdLoadException: print("[bold red]Could not initialize PFERD properly") From c3ce6bb31ca4aa17b94a50c044628d99bd01270c Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 11 May 2021 00:28:45 +0200 Subject: [PATCH 095/524] Fix crawler cleanup not being awaited --- PFERD/crawlers/local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 8501877..e80472e 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -29,7 +29,7 @@ class LocalCrawler(Crawler): async def crawl(self) -> None: await self._crawl_path(self._path, PurePath()) if self.error_free: - self.cleanup() + await self.cleanup() @anoncritical async def _crawl_path(self, path: Path, pure: PurePath) -> None: From 0acdee15a0987bef6f8de8105404bedf414bee72 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 18:57:20 +0200 Subject: [PATCH 096/524] Let crawlers obtain authenticators --- PFERD/crawler.py | 12 +++++++++++- PFERD/crawlers/__init__.py | 12 +++++++----- PFERD/pferd.py | 1 + 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 4bcfe65..5148d9d 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -4,10 +4,11 @@ from datetime import datetime from pathlib 
import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, - Callable, Optional, TypeVar) + Callable, Dict, Optional, TypeVar) from rich.markup import escape +from .authenticator import Authenticator from .conductor import ProgressBar, TerminalConductor from .config import Config, Section from .limiter import Limiter @@ -136,6 +137,15 @@ class CrawlerSection(Section): def transform(self) -> str: return self.s.get("transform", "") + def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator: + value = self.s.get("auth") + if value is None: + self.missing_value("auth") + auth = authenticators.get(f"auth:{value}") + if auth is None: + self.invalid_value("auth", value) + return auth + class Crawler(ABC): def __init__( diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index aa049b9..b2e5af5 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -1,19 +1,21 @@ from configparser import SectionProxy from typing import Callable, Dict +from ..authenticator import Authenticator from ..conductor import TerminalConductor from ..config import Config from ..crawler import Crawler from .local import LocalCrawler, LocalCrawlerSection CrawlerConstructor = Callable[[ - str, # Name (without the "crawl:" prefix) - SectionProxy, # Crawler's section of global config - Config, # Global config - TerminalConductor, # Global conductor instance + str, # Name (without the "crawl:" prefix) + SectionProxy, # Crawler's section of global config + Config, # Global config + TerminalConductor, # Global conductor instance + Dict[str, Authenticator], # Loaded authenticators by name ], Crawler] CRAWLERS: Dict[str, CrawlerConstructor] = { - "local": lambda n, s, c, t: + "local": lambda n, s, c, t, a: LocalCrawler(n, LocalCrawlerSection(s), c, t), } diff --git a/PFERD/pferd.py b/PFERD/pferd.py index fb411fb..4500ba9 100644 --- a/PFERD/pferd.py +++ 
b/PFERD/pferd.py @@ -62,6 +62,7 @@ class Pferd: section, self._config, self._conductor, + self._authenticators, ) self._crawlers[name] = crawler From 6bd6adb9771514cdeb17786762854db77a03463b Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 19:36:46 +0200 Subject: [PATCH 097/524] Fix tmp file names --- PFERD/output_dir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index c875574..08c01a3 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -228,7 +228,7 @@ class OutputDirectory: def _tmp_path(self, base: Path, suffix_length: int) -> Path: prefix = "" if base.name.startswith(".") else "." - suffix = random.choices(SUFFIX_CHARS, k=suffix_length) + suffix = "".join(random.choices(SUFFIX_CHARS, k=suffix_length)) name = f"{prefix}{base.name}.tmp.{suffix}" return base.parent / name From 910462bb721cd66997361da4a153d2e2a8d59d48 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 19:37:27 +0200 Subject: [PATCH 098/524] Log stuff happening to files --- PFERD/output_dir.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 08c01a3..18e0b6a 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -11,6 +11,8 @@ from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated from typing import AsyncContextManager, AsyncIterator, BinaryIO, Optional +from rich.markup import escape + from .conductor import TerminalConductor from .report import MarkConflictException, MarkDuplicateException, Report from .utils import prompt_yes_no @@ -330,8 +332,12 @@ class OutputDirectory: info.tmp_path.replace(info.local_path) if changed: + self._conductor.print( + f"[bold bright_yellow]Changed[/] {escape(str(info.path))}") self._report.change_file(info.path) else: + self._conductor.print( + f"[bold bright_green]Added[/] {escape(str(info.path))}") self._report.add_file(info.path) def cleanup(self) -> None: @@ 
-360,6 +366,8 @@ class OutputDirectory: if self._conflict_delete_lf(self._on_conflict, pure): try: path.unlink() + self._conductor.print( + f"[bold bright_magenta]Deleted[/] {escape(str(path))}") self._report.delete_file(pure) except OSError: pass From 68781a88ab607060e909d8985b436c4de0ce4779 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 19:39:49 +0200 Subject: [PATCH 099/524] Fix asynchronous methods being not awaited --- PFERD/crawler.py | 2 +- PFERD/output_dir.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 5148d9d..da35801 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -242,7 +242,7 @@ class Crawler(ABC): path, mtime, redownload, on_conflict) async def cleanup(self) -> None: - self._output_dir.cleanup() + await self._output_dir.cleanup() async def run(self) -> None: """ diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 18e0b6a..635ee43 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -340,30 +340,30 @@ class OutputDirectory: f"[bold bright_green]Added[/] {escape(str(info.path))}") self._report.add_file(info.path) - def cleanup(self) -> None: - self._cleanup_dir(self._root, PurePath()) + async def cleanup(self) -> None: + await self._cleanup_dir(self._root, PurePath()) - def _cleanup(self, path: Path, pure: PurePath) -> None: + async def _cleanup(self, path: Path, pure: PurePath) -> None: if path.is_dir(): - self._cleanup_dir(path, pure) + await self._cleanup_dir(path, pure) elif path.is_file(): - self._cleanup_file(path, pure) + await self._cleanup_file(path, pure) - def _cleanup_dir(self, path: Path, pure: PurePath) -> None: + async def _cleanup_dir(self, path: Path, pure: PurePath) -> None: for child in path.iterdir(): pure_child = pure / child.name - self._cleanup(child, pure_child) + await self._cleanup(child, pure_child) try: path.rmdir() except OSError: pass - def _cleanup_file(self, path: Path, pure: PurePath) -> None: + 
async def _cleanup_file(self, path: Path, pure: PurePath) -> None: if self._report.marked(pure): return - if self._conflict_delete_lf(self._on_conflict, pure): + if await self._conflict_delete_lf(self._on_conflict, pure): try: path.unlink() self._conductor.print( From 38bb66a776ef18070e7d46f4daeee35acb8c3e36 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 19:40:10 +0200 Subject: [PATCH 100/524] Update file metadata in more cases PFERD now not only updates file metadata when a file is successfully added or changed, but also when a file is downloaded and then detected to be unchanged. This could occur for example if a remote file's modification time was bumped, possibly because somebody touched the file without changing it. --- PFERD/output_dir.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 635ee43..571d73d 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -303,6 +303,11 @@ class OutputDirectory: return None + def _update_metadata(self, info: DownloadInfo) -> None: + if mtime := info.heuristics.mtime: + mtimestamp = mtime.timestamp() + os.utime(info.local_path, times=(mtimestamp, mtimestamp)) + async def _after_download(self, info: DownloadInfo) -> None: changed = False @@ -314,6 +319,7 @@ class OutputDirectory: if info.local_path.exists(): changed = True if filecmp.cmp(info.local_path, info.tmp_path): + self._update_metadata(info) info.tmp_path.unlink() return @@ -321,15 +327,8 @@ class OutputDirectory: info.tmp_path.unlink() return - # Modify metadata if necessary - if mtime := info.heuristics.mtime: - # TODO Pick an implementation - # Rounding up to avoid inaccuracies in how the OS stores timestamps - # mtimestamp = math.ceil(mtime.timestamp()) - mtimestamp = mtime.timestamp() - os.utime(info.tmp_path, times=(mtimestamp, mtimestamp)) - info.tmp_path.replace(info.local_path) + self._update_metadata(info) if changed: self._conductor.print( From 
94d6a01ccab6c58144e864fea7b8e77ada6a61a4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 19:42:40 +0200 Subject: [PATCH 101/524] Use file mtime in local crawler --- PFERD/crawlers/local.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index e80472e..fb08cc9 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -1,4 +1,5 @@ import asyncio +import datetime from pathlib import Path, PurePath from ..conductor import TerminalConductor @@ -48,12 +49,14 @@ class LocalCrawler(Crawler): async def _crawl_file(self, path: Path, pure: PurePath) -> None: async with self.download_bar(path) as bar: - bar.set_total(path.stat().st_size) - - dl = await self.download(pure) + stat = path.stat() + mtime = datetime.datetime.fromtimestamp(stat.st_mtime) + dl = await self.download(pure, mtime=mtime) if not dl: return + bar.set_total(stat.st_size) + async with dl as sink: with open(path, "rb") as f: while True: From e3ee4e515df08d0a7abeced81ef0cf4468abea6d Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 19:47:44 +0200 Subject: [PATCH 102/524] Disable highlighting of primitives This commit prevents rich from highlighting python-looking syntax like numbers, arrays, 'None' etc. 
--- PFERD/conductor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/PFERD/conductor.py b/PFERD/conductor.py index 4648e77..5022a22 100644 --- a/PFERD/conductor.py +++ b/PFERD/conductor.py @@ -3,6 +3,7 @@ from contextlib import asynccontextmanager, contextmanager from types import TracebackType from typing import AsyncIterator, Iterator, List, Optional, Type +from rich.console import Console from rich.progress import Progress, TaskID @@ -22,9 +23,11 @@ class TerminalConductor: def __init__(self) -> None: self._stopped = False self._lock = asyncio.Lock() - self._progress = Progress() self._lines: List[str] = [] + self._console = Console(highlight=False) + self._progress = Progress(console=self._console) + async def _start(self) -> None: for task in self._progress.tasks: task.visible = True @@ -61,7 +64,7 @@ class TerminalConductor: if self._stopped: self._lines.append(line) else: - self._progress.console.print(line) + self._console.print(line) @asynccontextmanager async def exclusive_output(self) -> AsyncIterator[None]: From 961f40f9a10d126a7c9a241b29dfcfac3b9ede10 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 19:55:04 +0200 Subject: [PATCH 103/524] Document simple authenticator --- CONFIG.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 16c8531..92c36ae 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -104,11 +104,18 @@ authenticators is `type`: This crawler crawls a local directory. It is really simple and mostly useful for testing different setups. -- `path`: Path to the local directory to crawl. (No default, must be specified) +- `path`: Path to the local directory to crawl. (Required) ## Authenticator types -TODO Fill in as authenticators are implemented +### The `simple` authenticator + +With this authenticator, the username and password can be set directly in the +config file. If the username or password are not specified, the user is prompted +via the terminal. 
+ +- `username`: The username (Optional) +- `password`: The password (Optional) ## Transformation rules From d565df27b31f5a7e635edc6d069d80cf65b1c3ef Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 22:28:14 +0200 Subject: [PATCH 104/524] Add HttpCrawler --- PFERD/crawler.py | 37 +++++++++++++++++++++++++++++++++++++ PFERD/output_dir.py | 5 ++++- PFERD/report.py | 12 ++++++++---- setup.cfg | 2 ++ 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index da35801..feb3f25 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -6,6 +6,7 @@ from pathlib import Path, PurePath from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, Callable, Dict, Optional, TypeVar) +import aiohttp from rich.markup import escape from .authenticator import Authenticator @@ -263,3 +264,39 @@ class Crawler(ABC): """ pass + + +class HttpCrawler(Crawler): + COOKIE_FILE = PurePath(".cookies") + + def __init__( + self, + name: str, + section: CrawlerSection, + config: Config, + conductor: TerminalConductor, + ) -> None: + super().__init__(name, section, config, conductor) + + self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) + self._output_dir.register_reserved(self.COOKIE_FILE) + + async def run(self) -> None: + cookie_jar = aiohttp.CookieJar() + + try: + cookie_jar.load(self._cookie_jar_path) + except Exception: + pass + + async with aiohttp.ClientSession(cookie_jar=cookie_jar) as session: + self.session = session + try: + await super().run() + finally: + del self.session + + try: + cookie_jar.save(self._cookie_jar_path) + except Exception: + self.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 571d73d..1be9a16 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -86,6 +86,9 @@ class OutputDirectory: self._report = Report() + def register_reserved(self, path: PurePath): + 
self._report.mark_reserved(path) + def _mark(self, path: PurePath) -> None: """ May throw an OutputDirException @@ -100,7 +103,7 @@ class OutputDirectory: msg = f"Collides with other file: {e.collides_with}" raise OutputDirException(msg) - def _resolve(self, path: PurePath) -> Path: + def resolve(self, path: PurePath) -> Path: """ May throw an OutputDirException. """ diff --git a/PFERD/report.py b/PFERD/report.py index b98c90c..2c7d8af 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -44,12 +44,16 @@ class Report: """ def __init__(self) -> None: + self.reserved_files: Set[PurePath] = set() self.known_files: Set[PurePath] = set() self.new_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set() + def mark_reserved(self, path: PurePath) -> None: + self.reserved_files.add(path) + def mark(self, path: PurePath) -> None: """ Mark a previously unknown file as known. @@ -58,12 +62,12 @@ class Report: detail, see the respective exception's docstring. 
""" - for known_path in self.known_files: - if path == known_path: + for other in self.known_files & self.reserved_files: + if path == other: raise MarkDuplicateException(path) - if is_relative_to(path, known_path) or is_relative_to(known_path, path): - raise MarkConflictException(path, known_path) + if is_relative_to(path, other) or is_relative_to(other, path): + raise MarkConflictException(path, other) self.known_files.add(path) diff --git a/setup.cfg b/setup.cfg index 1c6e764..9dcb111 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,6 +6,8 @@ version = 3.0.0 packages = PFERD python_requires = >=3.8 install_requires = + aiohttp>=3.7.4.post0 + beautifulsoup4>=4.9.3 rich>=10.1.0 [options.entry_points] From 93a5a94dab50e916ed13d28b55d5ba584a288b3d Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 13 May 2021 23:52:46 +0200 Subject: [PATCH 105/524] Single-source version number --- PFERD/__init__.py | 43 ------------------------------------------- PFERD/__main__.py | 43 +++++++++++++++++++++++++++++++++++++++++++ PFERD/version.py | 1 + setup.cfg | 4 ++-- 4 files changed, 46 insertions(+), 45 deletions(-) create mode 100644 PFERD/__main__.py create mode 100644 PFERD/version.py diff --git a/PFERD/__init__.py b/PFERD/__init__.py index a16b19b..e69de29 100644 --- a/PFERD/__init__.py +++ b/PFERD/__init__.py @@ -1,43 +0,0 @@ -import argparse -import asyncio -from pathlib import Path - -from .config import Config, ConfigDumpException, ConfigLoadException -from .pferd import Pferd - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument( - "--config", "-c", - type=Path, - metavar="PATH", - help="specify custom config file path", - ) - parser.add_argument( - "--dump-config", - nargs="?", - const=True, - type=Path, - metavar="PATH", - help="dump current configuration to a file and exit." 
- " Uses default config file path if no path is specified", - ) - args = parser.parse_args() - - try: - config_parser = Config.load_parser(args.config) - config = Config(config_parser) - except ConfigLoadException: - exit(1) - - if args.dump_config: - path = None if args.dump_config is True else args.dump_config - try: - config.dump(path) - except ConfigDumpException: - exit(1) - exit() - - pferd = Pferd(config) - asyncio.run(pferd.run()) diff --git a/PFERD/__main__.py b/PFERD/__main__.py new file mode 100644 index 0000000..a16b19b --- /dev/null +++ b/PFERD/__main__.py @@ -0,0 +1,43 @@ +import argparse +import asyncio +from pathlib import Path + +from .config import Config, ConfigDumpException, ConfigLoadException +from .pferd import Pferd + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", "-c", + type=Path, + metavar="PATH", + help="specify custom config file path", + ) + parser.add_argument( + "--dump-config", + nargs="?", + const=True, + type=Path, + metavar="PATH", + help="dump current configuration to a file and exit." 
+ " Uses default config file path if no path is specified", + ) + args = parser.parse_args() + + try: + config_parser = Config.load_parser(args.config) + config = Config(config_parser) + except ConfigLoadException: + exit(1) + + if args.dump_config: + path = None if args.dump_config is True else args.dump_config + try: + config.dump(path) + except ConfigDumpException: + exit(1) + exit() + + pferd = Pferd(config) + asyncio.run(pferd.run()) diff --git a/PFERD/version.py b/PFERD/version.py new file mode 100644 index 0000000..528787c --- /dev/null +++ b/PFERD/version.py @@ -0,0 +1 @@ +__version__ = "3.0.0" diff --git a/setup.cfg b/setup.cfg index 9dcb111..f2806e2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = PFERD -version = 3.0.0 +version = attr: PFERD.version.__version__ [options] packages = PFERD @@ -12,4 +12,4 @@ install_requires = [options.entry_points] console_scripts = - pferd = PFERD:main + pferd = PFERD.__main__:main From 6e5fdf4e9ee05eb22345a895056509fbdfaa9dda Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 14 May 2021 00:09:58 +0200 Subject: [PATCH 106/524] Set user agent to "pferd/" --- PFERD/crawler.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index feb3f25..ece62c1 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -15,6 +15,7 @@ from .config import Config, Section from .limiter import Limiter from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload from .transformer import RuleParseException, Transformer +from .version import __version__ class CrawlerLoadException(Exception): @@ -289,7 +290,10 @@ class HttpCrawler(Crawler): except Exception: pass - async with aiohttp.ClientSession(cookie_jar=cookie_jar) as session: + async with aiohttp.ClientSession( + headers={"User-Agent": f"pferd/{__version__}"}, + cookie_jar=cookie_jar, + ) as session: self.session = session try: await super().run() @@ -299,4 +303,7 @@ class HttpCrawler(Crawler): 
try: cookie_jar.save(self._cookie_jar_path) except Exception: - self.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") + self.print( + "[bold red]Warning:[/] Failed to save cookies to " + + escape(str(self.COOKIE_FILE)) + ) From a673ab0fae35c926c4f24e1c117fa0716b704d0b Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 14 May 2021 00:20:59 +0200 Subject: [PATCH 107/524] Delete old files I should've done this earlier --- PFERD/authenticators.py | 214 ----------- PFERD/cookie_jar.py | 69 ---- PFERD/diva.py | 169 --------- PFERD/download_summary.py | 75 ---- PFERD/downloaders.py | 72 ---- PFERD/errors.py | 57 --- PFERD/ilias/__init__.py | 10 - PFERD/ilias/authenticators.py | 138 ------- PFERD/ilias/crawler.py | 684 ---------------------------------- PFERD/ilias/date_demangler.py | 51 --- PFERD/ilias/downloader.py | 173 --------- PFERD/ipd.py | 154 -------- PFERD/location.py | 41 -- PFERD/logging.py | 184 --------- PFERD/organizer.py | 224 ----------- PFERD/progress.py | 111 ------ PFERD/tmp_dir.py | 79 ---- PFERD/transform.py | 142 ------- 18 files changed, 2647 deletions(-) delete mode 100644 PFERD/authenticators.py delete mode 100644 PFERD/cookie_jar.py delete mode 100644 PFERD/diva.py delete mode 100644 PFERD/download_summary.py delete mode 100644 PFERD/downloaders.py delete mode 100644 PFERD/errors.py delete mode 100644 PFERD/ilias/__init__.py delete mode 100644 PFERD/ilias/authenticators.py delete mode 100644 PFERD/ilias/crawler.py delete mode 100644 PFERD/ilias/date_demangler.py delete mode 100644 PFERD/ilias/downloader.py delete mode 100644 PFERD/ipd.py delete mode 100644 PFERD/location.py delete mode 100644 PFERD/logging.py delete mode 100644 PFERD/organizer.py delete mode 100644 PFERD/progress.py delete mode 100644 PFERD/tmp_dir.py delete mode 100644 PFERD/transform.py diff --git a/PFERD/authenticators.py b/PFERD/authenticators.py deleted file mode 100644 index f85c9d3..0000000 --- a/PFERD/authenticators.py +++ /dev/null 
@@ -1,214 +0,0 @@ -""" -General authenticators useful in many situations -""" - -import getpass -import logging -from typing import Optional, Tuple - -from .logging import PrettyLogger - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - -try: - import keyring -except ImportError: - pass - - -class TfaAuthenticator: - # pylint: disable=too-few-public-methods - """ - An authenticator for a TFA token. Always prompts the user, as the token can not be cached. - """ - - def __init__(self, reason: str): - """ - Create a new tfa authenticator. - - Arguments: - reason {str} -- the reason for obtaining the credentials - """ - self._reason = reason - - def get_token(self) -> str: - # pylint: disable=no-self-use - """ - Prompts the user for the token and returns it. - """ - print(f"Enter credentials ({self._reason})") - return getpass.getpass("TFA Token: ") - - -class UserPassAuthenticator: - """ - An authenticator for username-password combinations that prompts the user - for missing information. - """ - - def __init__( - self, - reason: str, - username: Optional[str] = None, - password: Optional[str] = None, - ) -> None: - """ - reason - what the credentials are used for - username - the username (if already known) - password - the password (if already known) - """ - - self._reason = reason - - self._given_username = username - self._given_password = password - - self._username = username - self._password = password - - def get_credentials(self) -> Tuple[str, str]: - """ - Returns a tuple (username, password). Prompts user for username or - password when necessary. 
- """ - - if self._username is None and self._given_username is not None: - self._username = self._given_username - - if self._password is None and self._given_password is not None: - self._password = self._given_password - - if self._username is None or self._password is None: - print(f"Enter credentials ({self._reason})") - - username: str - if self._username is None: - username = input("Username: ") - self._username = username - else: - username = self._username - - password: str - if self._password is None: - password = getpass.getpass(prompt="Password: ") - self._password = password - else: - password = self._password - - return (username, password) - - @property - def username(self) -> str: - """ - The username. Accessing this property may cause the authenticator to - prompt the user. - """ - - (username, _) = self.get_credentials() - return username - - @property - def password(self) -> str: - """ - The password. Accessing this property may cause the authenticator to - prompt the user. - """ - - (_, password) = self.get_credentials() - return password - - def invalidate_credentials(self) -> None: - """ - Marks the credentials as invalid. If only a username was supplied in - the constructor, assumes that the username is valid and only the - password is invalid. If only a password was supplied in the - constructor, assumes that the password is valid and only the username - is invalid. Otherwise, assumes that username and password are both - invalid. - """ - - self._username = None - self._password = None - - if self._given_username is not None and self._given_password is not None: - self._given_username = None - self._given_password = None - - -class KeyringAuthenticator(UserPassAuthenticator): - """ - An authenticator for username-password combinations that stores the - password using the system keyring service and prompts the user for missing - information. - """ - - def get_credentials(self) -> Tuple[str, str]: - """ - Returns a tuple (username, password). 
Prompts user for username or - password when necessary. - """ - - if self._username is None and self._given_username is not None: - self._username = self._given_username - - if self._password is None and self._given_password is not None: - self._password = self._given_password - - if self._username is not None and self._password is None: - self._load_password() - - if self._username is None or self._password is None: - print(f"Enter credentials ({self._reason})") - - username: str - if self._username is None: - username = input("Username: ") - self._username = username - else: - username = self._username - - if self._password is None: - self._load_password() - - password: str - if self._password is None: - password = getpass.getpass(prompt="Password: ") - self._password = password - self._save_password() - else: - password = self._password - - return (username, password) - - def _load_password(self) -> None: - """ - Loads the saved password associated with self._username from the system - keyring service (or None if not password has been saved yet) and stores - it in self._password. - """ - self._password = keyring.get_password("pferd-ilias", self._username) - - def _save_password(self) -> None: - """ - Saves self._password to the system keyring service and associates it - with self._username. - """ - keyring.set_password("pferd-ilias", self._username, self._password) - - def invalidate_credentials(self) -> None: - """ - Marks the credentials as invalid. If only a username was supplied in - the constructor, assumes that the username is valid and only the - password is invalid. If only a password was supplied in the - constructor, assumes that the password is valid and only the username - is invalid. Otherwise, assumes that username and password are both - invalid. 
- """ - - try: - keyring.delete_password("pferd-ilias", self._username) - except keyring.errors.PasswordDeleteError: - pass - - super().invalidate_credentials() diff --git a/PFERD/cookie_jar.py b/PFERD/cookie_jar.py deleted file mode 100644 index e5b568f..0000000 --- a/PFERD/cookie_jar.py +++ /dev/null @@ -1,69 +0,0 @@ -"""A helper for requests cookies.""" - -import logging -from http.cookiejar import LoadError, LWPCookieJar -from pathlib import Path -from typing import Optional - -import requests - -LOGGER = logging.getLogger(__name__) - - -class CookieJar: - """A cookie jar that can be persisted.""" - - def __init__(self, cookie_file: Optional[Path] = None) -> None: - """Create a new cookie jar at the given path. - - If the path is None, the cookies will not be persisted. - """ - self._cookies: LWPCookieJar - if cookie_file is None: - self._cookies = LWPCookieJar() - else: - self._cookies = LWPCookieJar(str(cookie_file.resolve())) - - @property - def cookies(self) -> LWPCookieJar: - """Return the requests cookie jar.""" - return self._cookies - - def load_cookies(self) -> None: - """Load all cookies from the file given in the constructor.""" - if self._cookies.filename is None: - return - - try: - LOGGER.info("Loading old cookies from %s", self._cookies.filename) - self._cookies.load(ignore_discard=True) - except (FileNotFoundError, LoadError): - LOGGER.warning( - "No valid cookie file found at %s, continuing with no cookies", - self._cookies.filename - ) - - def save_cookies(self, reason: Optional[str] = None) -> None: - """Save the cookies in the file given in the constructor.""" - if self._cookies.filename is None: - return - - if reason is None: - LOGGER.info("Saving cookies") - else: - LOGGER.info("Saving cookies (%s)", reason) - - # TODO figure out why ignore_discard is set - # TODO possibly catch a few more exceptions - self._cookies.save(ignore_discard=True) - - def create_session(self) -> requests.Session: - """Create a new session using the cookie 
jar.""" - sess = requests.Session() - - # From the request docs: "All requests code should work out of the box - # with externally provided instances of CookieJar, e.g. LWPCookieJar - # and FileCookieJar." - sess.cookies = self.cookies # type: ignore - - return sess diff --git a/PFERD/diva.py b/PFERD/diva.py deleted file mode 100644 index 148fa56..0000000 --- a/PFERD/diva.py +++ /dev/null @@ -1,169 +0,0 @@ -""" -Utility functions and a scraper/downloader for the KIT DIVA portal. -""" -import logging -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Callable, List, Optional - -import requests - -from .errors import FatalException -from .logging import PrettyLogger -from .organizer import Organizer -from .tmp_dir import TmpDir -from .transform import Transformable -from .utils import stream_to_path - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - - -@dataclass -class DivaDownloadInfo(Transformable): - """ - Information about a DIVA video - """ - url: str - - -DivaDownloadStrategy = Callable[[Organizer, DivaDownloadInfo], bool] - - -def diva_download_new(organizer: Organizer, info: DivaDownloadInfo) -> bool: - """ - Accepts only new files. - """ - resolved_file = organizer.resolve(info.path) - if not resolved_file.exists(): - return True - PRETTY.ignored_file(info.path, "local file exists") - return False - - -class DivaPlaylistCrawler: - # pylint: disable=too-few-public-methods - """ - A crawler for DIVA playlists. - """ - - _PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/" - _COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json" - - def __init__(self, playlist_id: str): - self._id = playlist_id - - @classmethod - def fetch_id(cls, playlist_link: str) -> str: - """ - Fetches the ID for a playerlist, given the base link - (e.g. https://mediaservice.bibliothek.kit.edu/#/details/DIVA-2019-271). 
- - Raises a FatalException, if the id can not be resolved - """ - match = re.match(r".+#/details/(.+)", playlist_link) - if match is None: - raise FatalException( - "DIVA: Invalid playlist link format, could not extract details." - ) - base_name = match.group(1) - - response = requests.get(cls._PLAYLIST_BASE_URL + base_name + ".json") - - if response.status_code != 200: - raise FatalException( - f"DIVA: Got non-200 status code ({response.status_code}))" - f"when requesting {response.url!r}!" - ) - - body = response.json() - - if body["error"]: - raise FatalException(f"DIVA: Server returned error {body['error']!r}.") - - return body["result"]["collection"]["id"] - - def crawl(self) -> List[DivaDownloadInfo]: - """ - Crawls the playlist given in the constructor. - """ - response = requests.get(self._COLLECTION_BASE_URL, params={"collection": self._id}) - if response.status_code != 200: - raise FatalException(f"Server returned status {response.status_code}.") - - body = response.json() - - if body["error"]: - raise FatalException(f"Server returned error {body['error']!r}.") - - result = body["result"] - - if result["resultCount"] > result["pageSize"]: - PRETTY.warning("Did not receive all results, some will be missing") - - download_infos: List[DivaDownloadInfo] = [] - - for video in result["resultList"]: - title = video["title"] - collection_title = self._follow_path(["collection", "title"], video) - url = self._follow_path( - ["resourceList", "derivateList", "mp4", "url"], - video - ) - - if url and collection_title and title: - path = Path(collection_title, title + ".mp4") - download_infos.append(DivaDownloadInfo(path, url)) - else: - PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}") - - return download_infos - - @staticmethod - def _follow_path(path: List[str], obj: Any) -> Optional[Any]: - """ - Follows a property path through an object, bailing at the first None. 
- """ - current = obj - for path_step in path: - if path_step in current: - current = current[path_step] - else: - return None - return current - - -class DivaDownloader: - """ - A downloader for DIVA videos. - """ - - def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy): - self._tmp_dir = tmp_dir - self._organizer = organizer - self._strategy = strategy - self._session = requests.session() - - def download_all(self, infos: List[DivaDownloadInfo]) -> None: - """ - Download multiple files one after the other. - """ - for info in infos: - self.download(info) - - def download(self, info: DivaDownloadInfo) -> None: - """ - Download a single file. - """ - if not self._strategy(self._organizer, info): - self._organizer.mark(info.path) - return - - with self._session.get(info.url, stream=True) as response: - if response.status_code == 200: - tmp_file = self._tmp_dir.new_path() - stream_to_path(response, tmp_file, info.path.name) - self._organizer.accept_file(tmp_file, info.path) - else: - PRETTY.warning(f"Could not download file, got response {response.status_code}") diff --git a/PFERD/download_summary.py b/PFERD/download_summary.py deleted file mode 100644 index 3b9a024..0000000 --- a/PFERD/download_summary.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Provides a summary that keeps track of new modified or deleted files. -""" -from pathlib import Path -from typing import List - - -def _mergeNoDuplicate(first: List[Path], second: List[Path]) -> List[Path]: - tmp = list(set(first + second)) - tmp.sort(key=lambda x: str(x.resolve())) - return tmp - - -class DownloadSummary: - """ - Keeps track of all new, modified or deleted files and provides a summary. - """ - - def __init__(self) -> None: - self._new_files: List[Path] = [] - self._modified_files: List[Path] = [] - self._deleted_files: List[Path] = [] - - @property - def new_files(self) -> List[Path]: - """ - Returns all new files. 
- """ - return self._new_files.copy() - - @property - def modified_files(self) -> List[Path]: - """ - Returns all modified files. - """ - return self._modified_files.copy() - - @property - def deleted_files(self) -> List[Path]: - """ - Returns all deleted files. - """ - return self._deleted_files.copy() - - def merge(self, summary: 'DownloadSummary') -> None: - """ - Merges ourselves with the passed summary. Modifies this object, but not the passed one. - """ - self._new_files = _mergeNoDuplicate(self._new_files, summary.new_files) - self._modified_files = _mergeNoDuplicate(self._modified_files, summary.modified_files) - self._deleted_files = _mergeNoDuplicate(self._deleted_files, summary.deleted_files) - - def add_deleted_file(self, path: Path) -> None: - """ - Registers a file as deleted. - """ - self._deleted_files.append(path) - - def add_modified_file(self, path: Path) -> None: - """ - Registers a file as changed. - """ - self._modified_files.append(path) - - def add_new_file(self, path: Path) -> None: - """ - Registers a file as new. - """ - self._new_files.append(path) - - def has_updates(self) -> bool: - """ - Returns whether this summary has any updates. - """ - return bool(self._new_files or self._modified_files or self._deleted_files) diff --git a/PFERD/downloaders.py b/PFERD/downloaders.py deleted file mode 100644 index 94b8b9f..0000000 --- a/PFERD/downloaders.py +++ /dev/null @@ -1,72 +0,0 @@ -""" -General downloaders useful in many situations -""" - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -import requests -import requests.auth - -from .organizer import Organizer -from .tmp_dir import TmpDir -from .transform import Transformable -from .utils import stream_to_path - - -@dataclass -class HttpDownloadInfo(Transformable): - """ - This class describes a single file to be downloaded. 
- """ - - url: str - parameters: Dict[str, Any] = field(default_factory=dict) - - -class HttpDownloader: - """A HTTP downloader that can handle HTTP basic auth.""" - - def __init__( - self, - tmp_dir: TmpDir, - organizer: Organizer, - username: Optional[str], - password: Optional[str], - ): - """Create a new http downloader.""" - self._organizer = organizer - self._tmp_dir = tmp_dir - self._username = username - self._password = password - self._session = self._build_session() - - def _build_session(self) -> requests.Session: - session = requests.Session() - if self._username and self._password: - session.auth = requests.auth.HTTPBasicAuth( - self._username, self._password - ) - return session - - def download_all(self, infos: List[HttpDownloadInfo]) -> None: - """ - Download multiple files one after the other. - """ - - for info in infos: - self.download(info) - - def download(self, info: HttpDownloadInfo) -> None: - """ - Download a single file. - """ - - with self._session.get(info.url, params=info.parameters, stream=True) as response: - if response.status_code == 200: - tmp_file = self._tmp_dir.new_path() - stream_to_path(response, tmp_file, info.path.name) - self._organizer.accept_file(tmp_file, info.path) - else: - # TODO use proper exception - raise Exception(f"Could not download file, got response {response.status_code}") diff --git a/PFERD/errors.py b/PFERD/errors.py deleted file mode 100644 index d960e13..0000000 --- a/PFERD/errors.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -An error logging decorator. -""" - -import logging -from typing import Any, Callable, TypeVar, cast - -from rich.console import Console - -from .logging import PrettyLogger - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - - -class FatalException(Exception): - """ - A fatal exception occurred. Recovery is not possible. 
- """ - - -TFun = TypeVar('TFun', bound=Callable[..., Any]) - - -def swallow_and_print_errors(function: TFun) -> TFun: - """ - Decorates a function, swallows all errors, logs them and returns none if one occurred. - """ - def inner(*args: Any, **kwargs: Any) -> Any: - # pylint: disable=broad-except - try: - return function(*args, **kwargs) - except FatalException as error: - PRETTY.error(str(error)) - return None - except Exception as error: - Console().print_exception() - return None - return cast(TFun, inner) - - -def retry_on_io_exception(max_retries: int, message: str) -> Callable[[TFun], TFun]: - """ - Decorates a function and retries it on any exception until the max retries count is hit. - """ - def retry(function: TFun) -> TFun: - def inner(*args: Any, **kwargs: Any) -> Any: - for i in range(0, max_retries): - # pylint: disable=broad-except - try: - return function(*args, **kwargs) - except IOError as error: - PRETTY.warning(f"Error duing operation '{message}': {error}") - PRETTY.warning( - f"Retrying operation '{message}'. Remaining retries: {max_retries - 1 - i}") - return cast(TFun, inner) - return retry diff --git a/PFERD/ilias/__init__.py b/PFERD/ilias/__init__.py deleted file mode 100644 index 0a5f08b..0000000 --- a/PFERD/ilias/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Synchronizing files from ILIAS instances (https://www.ilias.de/). -""" - -from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator -from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter, - IliasElementType) -from .downloader import (IliasDownloader, IliasDownloadInfo, - IliasDownloadStrategy, download_everything, - download_modified_or_new) diff --git a/PFERD/ilias/authenticators.py b/PFERD/ilias/authenticators.py deleted file mode 100644 index 4b99dd8..0000000 --- a/PFERD/ilias/authenticators.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -Authenticators that can obtain proper ILIAS session cookies. 
-""" - -import abc -import logging -from typing import Optional - -import bs4 -import requests - -from ..authenticators import TfaAuthenticator, UserPassAuthenticator -from ..utils import soupify - -LOGGER = logging.getLogger(__name__) - - -class IliasAuthenticator(abc.ABC): - # pylint: disable=too-few-public-methods - - """ - An authenticator that logs an existing requests session into an ILIAS - account. - """ - - @abc.abstractmethod - def authenticate(self, sess: requests.Session) -> None: - """ - Log a requests session into this authenticator's ILIAS account. - """ - - -class KitShibbolethAuthenticator(IliasAuthenticator): - # pylint: disable=too-few-public-methods - - """ - Authenticate via KIT's shibboleth system. - """ - - def __init__(self, authenticator: Optional[UserPassAuthenticator] = None) -> None: - if authenticator: - self._auth = authenticator - else: - self._auth = UserPassAuthenticator("KIT ILIAS Shibboleth") - - self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth") - - def authenticate(self, sess: requests.Session) -> None: - """ - Performs the ILIAS Shibboleth authentication dance and saves the login - cookies it receieves. - - This function should only be called whenever it is detected that you're - not logged in. The cookies obtained should be good for a few minutes, - maybe even an hour or two. 
- """ - - # Equivalent: Click on "Mit KIT-Account anmelden" button in - # https://ilias.studium.kit.edu/login.php - LOGGER.debug("Begin authentication process with ILIAS") - url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login" - data = { - "sendLogin": "1", - "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", - "target": "/shib_login.php", - "home_organization_selection": "Mit KIT-Account anmelden", - } - soup = soupify(sess.post(url, data=data)) - - # Attempt to login using credentials, if necessary - while not self._login_successful(soup): - # Searching the form here so that this fails before asking for - # credentials rather than after asking. - form = soup.find("form", {"class": "full content", "method": "post"}) - action = form["action"] - - csrf_token = form.find("input", {"name": "csrf_token"})["value"] - - # Equivalent: Enter credentials in - # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO - LOGGER.debug("Attempt to log in to Shibboleth using credentials") - url = "https://idp.scc.kit.edu" + action - data = { - "_eventId_proceed": "", - "j_username": self._auth.username, - "j_password": self._auth.password, - "csrf_token": csrf_token - } - soup = soupify(sess.post(url, data=data)) - - if self._tfa_required(soup): - soup = self._authenticate_tfa(sess, soup) - - if not self._login_successful(soup): - print("Incorrect credentials.") - self._auth.invalidate_credentials() - - # Equivalent: Being redirected via JS automatically - # (or clicking "Continue" if you have JS disabled) - LOGGER.debug("Redirect back to ILIAS with login information") - relay_state = soup.find("input", {"name": "RelayState"}) - saml_response = soup.find("input", {"name": "SAMLResponse"}) - url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" - data = { # using the info obtained in the while loop above - "RelayState": relay_state["value"], - "SAMLResponse": saml_response["value"], - } - sess.post(url, data=data) - - def _authenticate_tfa( - self, - 
session: requests.Session, - soup: bs4.BeautifulSoup - ) -> bs4.BeautifulSoup: - # Searching the form here so that this fails before asking for - # credentials rather than after asking. - form = soup.find("form", {"method": "post"}) - action = form["action"] - - # Equivalent: Enter token in - # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO - LOGGER.debug("Attempt to log in to Shibboleth with TFA token") - url = "https://idp.scc.kit.edu" + action - data = { - "_eventId_proceed": "", - "j_tokenNumber": self._tfa_auth.get_token() - } - return soupify(session.post(url, data=data)) - - @staticmethod - def _login_successful(soup: bs4.BeautifulSoup) -> bool: - relay_state = soup.find("input", {"name": "RelayState"}) - saml_response = soup.find("input", {"name": "SAMLResponse"}) - return relay_state is not None and saml_response is not None - - @staticmethod - def _tfa_required(soup: bs4.BeautifulSoup) -> bool: - return soup.find(id="j_tokenNumber") is not None diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py deleted file mode 100644 index edab284..0000000 --- a/PFERD/ilias/crawler.py +++ /dev/null @@ -1,684 +0,0 @@ -""" -Contains an ILIAS crawler alongside helper functions. 
-""" - -import datetime -import json -import logging -import re -from enum import Enum -from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union -from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, - urlunsplit) - -import bs4 -import requests - -from ..errors import FatalException, retry_on_io_exception -from ..logging import PrettyLogger -from ..utils import soupify -from .authenticators import IliasAuthenticator -from .date_demangler import demangle_date -from .downloader import IliasDownloadInfo - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - - -def _sanitize_path_name(name: str) -> str: - return name.replace("/", "-").replace("\\", "-") - - -class IliasElementType(Enum): - """ - The type of an ilias element. - """ - REGULAR_FOLDER = "REGULAR_FOLDER" - VIDEO_FOLDER = "VIDEO_FOLDER" - EXERCISE_FOLDER = "EXERCISE_FOLDER" - REGULAR_FILE = "REGULAR_FILE" - VIDEO_FILE = "VIDEO_FILE" - FORUM = "FORUM" - MEETING = "MEETING" - EXTERNAL_LINK = "EXTERNAL_LINK" - - def is_folder(self) -> bool: - """ - Returns whether this type is some kind of folder. - """ - return "FOLDER" in str(self.name) - - -IliasDirectoryFilter = Callable[[Path, IliasElementType], bool] - - -class IliasCrawlerEntry: - # pylint: disable=too-few-public-methods - """ - An ILIAS crawler entry used internally to find, catalogue and recursively crawl elements. - """ - - def __init__( - self, - path: Path, - url: Union[str, Callable[[], Optional[str]]], - entry_type: IliasElementType, - modification_date: Optional[datetime.datetime] - ): - self.path = path - if isinstance(url, str): - str_url = url - self.url: Callable[[], Optional[str]] = lambda: str_url - else: - self.url = url - self.entry_type = entry_type - self.modification_date = modification_date - - def to_download_info(self) -> Optional[IliasDownloadInfo]: - """ - Converts this crawler entry to an IliasDownloadInfo, if possible. 
- This method will only succeed for *File* types. - """ - if self.entry_type in [IliasElementType.REGULAR_FILE, IliasElementType.VIDEO_FILE]: - return IliasDownloadInfo(self.path, self.url, self.modification_date) - return None - - -class IliasCrawler: - # pylint: disable=too-few-public-methods - - """ - A crawler for ILIAS. - """ - - # pylint: disable=too-many-arguments - def __init__( - self, - base_url: str, - session: requests.Session, - authenticator: IliasAuthenticator, - dir_filter: IliasDirectoryFilter - ): - """ - Create a new ILIAS crawler. - """ - - self._base_url = base_url - self._session = session - self._authenticator = authenticator - self.dir_filter = dir_filter - - @staticmethod - def _url_set_query_param(url: str, param: str, value: str) -> str: - """ - Set a query parameter in an url, overwriting existing ones with the same name. - """ - scheme, netloc, path, query, fragment = urlsplit(url) - query_parameters = parse_qs(query) - query_parameters[param] = [value] - new_query_string = urlencode(query_parameters, doseq=True) - - return urlunsplit((scheme, netloc, path, new_query_string, fragment)) - - def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]: - """ - Crawls a given url *and all reachable elements in it*. - - Args: - url {str} -- the *full* url to crawl - """ - start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url) - return self._iterate_entries_to_download_infos(start_entries) - - def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]: - """ - Starts the crawl process for a course, yielding a list of elements to (potentially) - download. 
- - Arguments: - course_id {str} -- the course id - - Raises: - FatalException: if an unrecoverable error occurs or the course id is not valid - """ - # Start crawling at the given course - root_url = self._url_set_query_param( - self._base_url + "/goto.php", "target", f"crs_{course_id}" - ) - - if not self._is_course_id_valid(root_url, course_id): - raise FatalException( - "Invalid course id? I didn't find anything looking like a course!" - ) - - # And treat it as a folder - entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), root_url) - return self._iterate_entries_to_download_infos(entries) - - def _is_course_id_valid(self, root_url: str, course_id: str) -> bool: - response: requests.Response = self._session.get(root_url) - # We were redirected ==> Non-existant ID - if course_id not in response.url: - return False - - link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link") - if not link_element: - return False - # It wasn't a course but a category list, forum, etc. - return "crs_" in link_element.get("value") - - def find_course_name(self, course_id: str) -> Optional[str]: - """ - Returns the name of a given course. None if it is not a valid course - or it could not be found. - """ - course_url = self._url_set_query_param( - self._base_url + "/goto.php", "target", f"crs_{course_id}" - ) - return self.find_element_name(course_url) - - def find_element_name(self, url: str) -> Optional[str]: - """ - Returns the name of the element at the given URL, if it can find one. - """ - focus_element: bs4.Tag = self._get_page(url, {}).find(id="il_mhead_t_focus") - if not focus_element: - return None - return focus_element.text - - def crawl_personal_desktop(self) -> List[IliasDownloadInfo]: - """ - Crawls the ILIAS personal desktop (and every subelements that can be reached from there). 
- - Raises: - FatalException: if an unrecoverable error occurs - """ - entries: List[IliasCrawlerEntry] = self._crawl_folder( - Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI" - ) - return self._iterate_entries_to_download_infos(entries) - - def _iterate_entries_to_download_infos( - self, - entries: List[IliasCrawlerEntry] - ) -> List[IliasDownloadInfo]: - result: List[IliasDownloadInfo] = [] - entries_to_process: List[IliasCrawlerEntry] = entries.copy() - while len(entries_to_process) > 0: - entry = entries_to_process.pop() - - if entry.entry_type == IliasElementType.EXTERNAL_LINK: - PRETTY.not_searching(entry.path, "external link") - continue - if entry.entry_type == IliasElementType.FORUM: - PRETTY.not_searching(entry.path, "forum") - continue - - if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type): - PRETTY.not_searching(entry.path, "user filter") - continue - - download_info = entry.to_download_info() - if download_info is not None: - result.append(download_info) - continue - - url = entry.url() - - if url is None: - PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it") - continue - - PRETTY.searching(entry.path) - - if entry.entry_type == IliasElementType.EXERCISE_FOLDER: - entries_to_process += self._crawl_exercises(entry.path, url) - continue - if entry.entry_type == IliasElementType.REGULAR_FOLDER: - entries_to_process += self._crawl_folder(entry.path, url) - continue - if entry.entry_type == IliasElementType.VIDEO_FOLDER: - entries_to_process += self._crawl_video_directory(entry.path, url) - continue - - PRETTY.warning(f"Unknown type: {entry.entry_type}!") - - return result - - def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]: - """ - Crawl all files in a folder-like element. 
- """ - soup = self._get_page(url, {}) - - if soup.find(id="headerimage"): - element: bs4.Tag = soup.find(id="headerimage") - if "opencast" in element.attrs["src"].lower(): - PRETTY.warning(f"Switched to crawling a video at {folder_path}") - if not self.dir_filter(folder_path, IliasElementType.VIDEO_FOLDER): - PRETTY.not_searching(folder_path, "user filter") - return [] - return self._crawl_video_directory(folder_path, url) - - result: List[IliasCrawlerEntry] = [] - - # Fetch all links and throw them to the general interpreter - links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle") - for link in links: - abs_url = self._abs_url_from_link(link) - element_path = Path(folder_path, _sanitize_path_name(link.getText().strip())) - element_type = self._find_type_from_link(element_path, link, abs_url) - - if element_type == IliasElementType.REGULAR_FILE: - result += self._crawl_file(folder_path, link, abs_url) - elif element_type == IliasElementType.MEETING: - meeting_name = str(element_path.name) - date_portion_str = meeting_name.split(" - ")[0] - date_portion = demangle_date(date_portion_str) - - if not date_portion: - result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] - continue - - rest_of_name = meeting_name - if rest_of_name.startswith(date_portion_str): - rest_of_name = rest_of_name[len(date_portion_str):] - - new_name = datetime.datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") \ - + rest_of_name - new_path = Path(folder_path, _sanitize_path_name(new_name)) - result += [ - IliasCrawlerEntry(new_path, abs_url, IliasElementType.REGULAR_FOLDER, None) - ] - elif element_type is not None: - result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)] - else: - PRETTY.warning(f"Found element without a type at {str(element_path)!r}") - - return result - - def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: - """ - Create an absolute url from an tag. 
- """ - return urljoin(self._base_url, link_tag.get("href")) - - @staticmethod - def _find_type_from_link( - path: Path, - link_element: bs4.Tag, - url: str - ) -> Optional[IliasElementType]: - """ - Decides which sub crawler to use for a given top level element. - """ - parsed_url = urlparse(url) - LOGGER.debug("Parsed url: %r", parsed_url) - - # file URLs contain "target=file" - if "target=file_" in parsed_url.query: - return IliasElementType.REGULAR_FILE - - # Skip forums - if "cmd=showThreads" in parsed_url.query: - return IliasElementType.FORUM - - # Everything with a ref_id can *probably* be opened to reveal nested things - # video groups, directories, exercises, etc - if "ref_id=" in parsed_url.query: - return IliasCrawler._find_type_from_folder_like(link_element, url) - - PRETTY.warning( - "Got unknown element type in switch. I am not sure what horror I found on the" - f" ILIAS page. The element was at {str(path)!r} and it is {link_element!r})" - ) - return None - - @staticmethod - def _find_type_from_folder_like(link_element: bs4.Tag, url: str) -> Optional[IliasElementType]: - """ - Try crawling something that looks like a folder. 
- """ - # pylint: disable=too-many-return-statements - - found_parent: Optional[bs4.Tag] = None - - # We look for the outer div of our inner link, to find information around it - # (mostly the icon) - for parent in link_element.parents: - if "ilContainerListItemOuter" in parent["class"]: - found_parent = parent - break - - if found_parent is None: - PRETTY.warning(f"Could not find element icon for {url!r}") - return None - - # Find the small descriptive icon to figure out the type - img_tag: Optional[bs4.Tag] = found_parent.select_one("img.ilListItemIcon") - - if img_tag is None: - PRETTY.warning(f"Could not find image tag for {url!r}") - return None - - if "opencast" in str(img_tag["alt"]).lower(): - return IliasElementType.VIDEO_FOLDER - - if str(img_tag["src"]).endswith("icon_exc.svg"): - return IliasElementType.EXERCISE_FOLDER - - if str(img_tag["src"]).endswith("icon_webr.svg"): - return IliasElementType.EXTERNAL_LINK - - if str(img_tag["src"]).endswith("frm.svg"): - return IliasElementType.FORUM - - if str(img_tag["src"]).endswith("sess.svg"): - return IliasElementType.MEETING - - return IliasElementType.REGULAR_FOLDER - - @staticmethod - def _crawl_file(path: Path, link_element: bs4.Tag, url: str) -> List[IliasCrawlerEntry]: - """ - Crawls a file. - """ - # Files have a list of properties (type, modification date, size, etc.) - # In a series of divs. - # Find the parent containing all those divs, so we can filter our what we need - properties_parent: bs4.Tag = link_element.findParent( - "div", {"class": lambda x: "il_ContainerListItem" in x} - ).select_one(".il_ItemProperties") - # The first one is always the filetype - file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() - - # The rest does not have a stable order. Grab the whole text and reg-ex the date - # out of it - all_properties_text = properties_parent.getText().strip() - modification_date_match = re.search( - r"(((\d+\. 
\w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", - all_properties_text - ) - if modification_date_match is None: - modification_date = None - PRETTY.warning(f"Could not extract start date from {all_properties_text!r}") - else: - modification_date_str = modification_date_match.group(1) - modification_date = demangle_date(modification_date_str) - - # Grab the name from the link text - name = _sanitize_path_name(link_element.getText()) - full_path = Path(path, name + "." + file_type) - - return [ - IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date) - ] - - def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]: - """ - Crawl the video overview site. - """ - initial_soup = self._get_page(url, {}) - - # The page is actually emtpy but contains a much needed token in the link below. - # That token can be used to fetch the *actual* video listing - content_link: bs4.Tag = initial_soup.select_one("#tab_series a") - # Fetch the actual video listing. 
The given parameters return all videos (max 800) - # in a standalone html page - video_list_soup = self._get_page( - self._abs_url_from_link(content_link), - {"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} - ) - - # If we find a page selected, we probably need to respect pagination - if self._is_paginated_video_page(video_list_soup): - second_stage_url = self._abs_url_from_link(content_link) - - return self._crawl_paginated_video_directory( - video_dir_path, video_list_soup, second_stage_url - ) - - return self._crawl_video_directory_second_stage(video_dir_path, video_list_soup) - - @staticmethod - def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool: - return soup.find(id=re.compile(r"tab_page_sel.+")) is not None - - def _crawl_paginated_video_directory( - self, - video_dir_path: Path, - paged_video_list_soup: bs4.BeautifulSoup, - second_stage_url: str - ) -> List[IliasCrawlerEntry]: - LOGGER.info("Found paginated video page, trying 800 elements") - - # Try to find the table id. This can be used to build the query parameter indicating - # you want 800 elements - - table_element: bs4.Tag = paged_video_list_soup.find( - name="table", id=re.compile(r"tbl_xoct_.+") - ) - if table_element is None: - PRETTY.warning( - "Could not increase elements per page (table not found)." - " Some might not be crawled!" - ) - return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) - - match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) - if match is None: - PRETTY.warning( - "Could not increase elements per page (table id not found)." - " Some might not be crawled!" 
- ) - return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) - table_id = match.group(1) - - extended_video_page = self._get_page( - second_stage_url, - {f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} - ) - - if self._is_paginated_video_page(extended_video_page): - PRETTY.warning( - "800 elements do not seem to be enough (or I failed to fetch that many)." - " I will miss elements." - ) - - return self._crawl_video_directory_second_stage(video_dir_path, extended_video_page) - - def _crawl_video_directory_second_stage( - self, - video_dir_path: Path, - video_list_soup: bs4.BeautifulSoup - ) -> List[IliasCrawlerEntry]: - """ - Crawls the "second stage" video page. This page contains the actual video urls. - """ - direct_download_links: List[bs4.Tag] = video_list_soup.findAll( - name="a", text=re.compile(r"\s*Download\s*") - ) - - # Video start links are marked with an "Abspielen" link - video_links: List[bs4.Tag] = video_list_soup.findAll( - name="a", text=re.compile(r"\s*Abspielen\s*") - ) - - results: List[IliasCrawlerEntry] = [] - - # We can download everything directly! - # FIXME: Sadly the download button is currently broken, so never do that - if False and len(direct_download_links) == len(video_links): - for link in direct_download_links: - results += self._crawl_single_video(video_dir_path, link, True) - else: - for link in video_links: - results += self._crawl_single_video(video_dir_path, link, False) - - return results - - def _crawl_single_video( - self, - parent_path: Path, - link: bs4.Tag, - direct_download: bool - ) -> List[IliasCrawlerEntry]: - """ - Crawl a single video based on its "Abspielen" link from the video listing. - """ - # The link is part of a table with multiple columns, describing metadata. 
- # 6th child (1 indexed) is the modification time string - modification_string = link.parent.parent.parent.select_one( - "td.std:nth-child(6)" - ).getText().strip() - modification_time = datetime.datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") - - title = link.parent.parent.parent.select_one( - "td.std:nth-child(3)" - ).getText().strip() - title += ".mp4" - - video_path: Path = Path(parent_path, _sanitize_path_name(title)) - - video_url = self._abs_url_from_link(link) - - # The video had a direct download button we can use instead - if direct_download: - LOGGER.debug("Using direct download for video %r", str(video_path)) - return [IliasCrawlerEntry( - video_path, video_url, IliasElementType.VIDEO_FILE, modification_time - )] - - return [IliasCrawlerEntry( - video_path, - self._crawl_video_url_from_play_link(video_url), - IliasElementType.VIDEO_FILE, - modification_time - )] - - def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Optional[str]]: - def inner() -> Optional[str]: - # Fetch the actual video page. This is a small wrapper page initializing a javscript - # player. Sadly we can not execute that JS. The actual video stream url is nowhere - # on the page, but defined in a JS object inside a script tag, passed to the player - # library. - # We do the impossible and RegEx the stream JSON object out of the page's HTML source - video_page_soup = soupify(self._session.get(play_url)) - regex: re.Pattern = re.compile( - r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE - ) - json_match = regex.search(str(video_page_soup)) - - if json_match is None: - PRETTY.warning(f"Could not find json stream info for {play_url!r}") - return None - json_str = json_match.group(1) - - # parse it - json_object = json.loads(json_str) - # and fetch the video url! 
- video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] - return video_url - return inner - - def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]: - """ - Crawl files offered for download in exercises. - """ - soup = self._get_page(url, {}) - - results: List[IliasCrawlerEntry] = [] - - # Each assignment is in an accordion container - assignment_containers: List[bs4.Tag] = soup.select(".il_VAccordionInnerContainer") - - for container in assignment_containers: - # Fetch the container name out of the header to use it in the path - container_name = container.select_one(".ilAssignmentHeader").getText().strip() - # Find all download links in the container (this will contain all the files) - files: List[bs4.Tag] = container.findAll( - name="a", - # download links contain the given command class - attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, - text="Download" - ) - - LOGGER.debug("Found exercise container %r", container_name) - - # Grab each file as you now have the link - for file_link in files: - # Two divs, side by side. Left is the name, right is the link ==> get left - # sibling - file_name = file_link.parent.findPrevious(name="div").getText().strip() - file_name = _sanitize_path_name(file_name) - url = self._abs_url_from_link(file_link) - - LOGGER.debug("Found file %r at %r", file_name, url) - - results.append(IliasCrawlerEntry( - Path(element_path, container_name, file_name), - url, - IliasElementType.REGULAR_FILE, - None # We do not have any timestamp - )) - - return results - - @retry_on_io_exception(3, "fetching webpage") - def _get_page(self, url: str, params: Dict[str, Any], - retry_count: int = 0) -> bs4.BeautifulSoup: - """ - Fetches a page from ILIAS, authenticating when needed. - """ - - if retry_count >= 4: - raise FatalException("Could not get a proper page after 4 tries. 
" - "Maybe your URL is wrong, authentication fails continuously, " - "your ILIAS connection is spotty or ILIAS is not well.") - - LOGGER.debug("Fetching %r", url) - - response = self._session.get(url, params=params) - content_type = response.headers["content-type"] - - if not content_type.startswith("text/html"): - raise FatalException( - f"Invalid content type {content_type} when crawling ilias page" - " {url!r} with {params!r}" - ) - - soup = soupify(response) - - if self._is_logged_in(soup): - return soup - - LOGGER.info("Not authenticated, changing that...") - - self._authenticator.authenticate(self._session) - - return self._get_page(url, params, retry_count + 1) - - @staticmethod - def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: - # Normal ILIAS pages - userlog = soup.find("li", {"id": "userlog"}) - if userlog is not None: - LOGGER.debug("Auth: Found #userlog") - return True - # Video listing embeds do not have complete ILIAS html. Try to match them by - # their video listing table - video_table = soup.find( - recursive=True, - name="table", - attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} - ) - if video_table is not None: - LOGGER.debug("Auth: Found #tbl_xoct.+") - return True - # The individual video player wrapper page has nothing of the above. - # Match it by its playerContainer. - if soup.select_one("#playerContainer") is not None: - LOGGER.debug("Auth: Found #playerContainer") - return True - return False diff --git a/PFERD/ilias/date_demangler.py b/PFERD/ilias/date_demangler.py deleted file mode 100644 index 2950d4d..0000000 --- a/PFERD/ilias/date_demangler.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Helper methods to demangle an ILIAS date. 
-""" - -import datetime -import locale -import logging -import re -from typing import Optional - -from ..logging import PrettyLogger - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - - -def demangle_date(date: str) -> Optional[datetime.datetime]: - """ - Demangle a given date in one of the following formats: - "Gestern, HH:MM" - "Heute, HH:MM" - "Morgen, HH:MM" - "dd. mon yyyy, HH:MM - """ - saved = locale.setlocale(locale.LC_ALL) - try: - try: - locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8') - except locale.Error: - PRETTY.warning( - "Could not set language to german. Assuming you use english everywhere." - ) - - date = re.sub(r"\s+", " ", date) - date = re.sub("Gestern|Yesterday", _yesterday().strftime("%d. %b %Y"), date, re.I) - date = re.sub("Heute|Today", datetime.date.today().strftime("%d. %b %Y"), date, re.I) - date = re.sub("Morgen|Tomorrow", _tomorrow().strftime("%d. %b %Y"), date, re.I) - return datetime.datetime.strptime(date, "%d. %b %Y, %H:%M") - except ValueError: - PRETTY.warning(f"Could not parse date {date!r}") - return None - finally: - locale.setlocale(locale.LC_ALL, saved) - - -def _yesterday() -> datetime.date: - return datetime.date.today() - datetime.timedelta(days=1) - - -def _tomorrow() -> datetime.date: - return datetime.date.today() + datetime.timedelta(days=1) diff --git a/PFERD/ilias/downloader.py b/PFERD/ilias/downloader.py deleted file mode 100644 index f6132bf..0000000 --- a/PFERD/ilias/downloader.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Contains a downloader for ILIAS.""" - -import datetime -import logging -import math -import os -from pathlib import Path, PurePath -from typing import Callable, List, Optional, Union - -import bs4 -import requests - -from ..errors import retry_on_io_exception -from ..logging import PrettyLogger -from ..organizer import Organizer -from ..tmp_dir import TmpDir -from ..transform import Transformable -from ..utils import soupify, stream_to_path -from .authenticators import 
IliasAuthenticator - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - - -class ContentTypeException(Exception): - """Thrown when the content type of the ilias element can not be handled.""" - - -class IliasDownloadInfo(Transformable): - """ - This class describes a single file to be downloaded. - """ - - def __init__( - self, - path: PurePath, - url: Union[str, Callable[[], Optional[str]]], - modifcation_date: Optional[datetime.datetime] - ): - super().__init__(path) - if isinstance(url, str): - string_url = url - self.url: Callable[[], Optional[str]] = lambda: string_url - else: - self.url = url - self.modification_date = modifcation_date - - -IliasDownloadStrategy = Callable[[Organizer, IliasDownloadInfo], bool] - - -def download_everything(organizer: Organizer, info: IliasDownloadInfo) -> bool: - # pylint: disable=unused-argument - """ - Accepts everything. - """ - return True - - -def download_modified_or_new(organizer: Organizer, info: IliasDownloadInfo) -> bool: - """ - Accepts new files or files with a more recent modification date. - """ - resolved_file = organizer.resolve(info.path) - if not resolved_file.exists() or info.modification_date is None: - return True - resolved_mod_time_seconds = resolved_file.stat().st_mtime - - # Download if the info is newer - if info.modification_date.timestamp() > resolved_mod_time_seconds: - return True - - PRETTY.ignored_file(info.path, "local file has newer or equal modification time") - return False - - -class IliasDownloader: - # pylint: disable=too-many-arguments - """A downloader for ILIAS.""" - - def __init__( - self, - tmp_dir: TmpDir, - organizer: Organizer, - session: requests.Session, - authenticator: IliasAuthenticator, - strategy: IliasDownloadStrategy, - timeout: int = 5 - ): - """ - Create a new IliasDownloader. 
- - The timeout applies to the download request only, as bwcloud uses IPv6 - and requests has a problem with that: https://github.com/psf/requests/issues/5522 - """ - - self._tmp_dir = tmp_dir - self._organizer = organizer - self._session = session - self._authenticator = authenticator - self._strategy = strategy - self._timeout = timeout - - def download_all(self, infos: List[IliasDownloadInfo]) -> None: - """ - Download multiple files one after the other. - """ - - for info in infos: - self.download(info) - - def download(self, info: IliasDownloadInfo) -> None: - """ - Download a file from ILIAS. - - Retries authentication until eternity if it could not fetch the file. - """ - - LOGGER.debug("Downloading %r", info) - - if not self._strategy(self._organizer, info): - self._organizer.mark(info.path) - return - - tmp_file = self._tmp_dir.new_path() - - @retry_on_io_exception(3, "downloading file") - def download_impl() -> bool: - if not self._try_download(info, tmp_file): - LOGGER.info("Re-Authenticating due to download failure: %r", info) - self._authenticator.authenticate(self._session) - raise IOError("Scheduled retry") - else: - return True - - if not download_impl(): - PRETTY.error(f"Download of file {info.path} failed too often! 
Skipping it...") - return - - dst_path = self._organizer.accept_file(tmp_file, info.path) - if dst_path and info.modification_date: - os.utime( - dst_path, - times=( - math.ceil(info.modification_date.timestamp()), - math.ceil(info.modification_date.timestamp()) - ) - ) - - def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool: - url = info.url() - if url is None: - PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/") - return True - - with self._session.get(url, stream=True, timeout=self._timeout) as response: - content_type = response.headers["content-type"] - has_content_disposition = "content-disposition" in response.headers - - if content_type.startswith("text/html") and not has_content_disposition: - if self._is_logged_in(soupify(response)): - raise ContentTypeException("Attempting to download a web page, not a file") - - return False - - # Yay, we got the file :) - stream_to_path(response, target, info.path.name) - return True - - @staticmethod - def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: - userlog = soup.find("li", {"id": "userlog"}) - return userlog is not None diff --git a/PFERD/ipd.py b/PFERD/ipd.py deleted file mode 100644 index ece6a97..0000000 --- a/PFERD/ipd.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Utility functions and a scraper/downloader for the IPD pages. -""" -import datetime -import logging -import math -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, List, Optional -from urllib.parse import urljoin - -import bs4 -import requests - -from PFERD.errors import FatalException -from PFERD.utils import soupify - -from .logging import PrettyLogger -from .organizer import Organizer -from .tmp_dir import TmpDir -from .transform import Transformable -from .utils import stream_to_path - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - - -@dataclass -class IpdDownloadInfo(Transformable): - """ - Information about an ipd entry. 
- """ - url: str - modification_date: Optional[datetime.datetime] - - -IpdDownloadStrategy = Callable[[Organizer, IpdDownloadInfo], bool] - - -def ipd_download_new_or_modified(organizer: Organizer, info: IpdDownloadInfo) -> bool: - """ - Accepts new files or files with a more recent modification date. - """ - resolved_file = organizer.resolve(info.path) - if not resolved_file.exists(): - return True - if not info.modification_date: - PRETTY.ignored_file(info.path, "could not find modification time, file exists") - return False - - resolved_mod_time_seconds = resolved_file.stat().st_mtime - - # Download if the info is newer - if info.modification_date.timestamp() > resolved_mod_time_seconds: - return True - - PRETTY.ignored_file(info.path, "local file has newer or equal modification time") - return False - - -class IpdCrawler: - # pylint: disable=too-few-public-methods - """ - A crawler for IPD pages. - """ - - def __init__(self, base_url: str): - self._base_url = base_url - - def _abs_url_from_link(self, link_tag: bs4.Tag) -> str: - """ - Create an absolute url from an tag. - """ - return urljoin(self._base_url, link_tag.get("href")) - - def crawl(self) -> List[IpdDownloadInfo]: - """ - Crawls the playlist given in the constructor. 
- """ - page = soupify(requests.get(self._base_url)) - - items: List[IpdDownloadInfo] = [] - - def is_relevant_url(x: str) -> bool: - return x.endswith(".pdf") or x.endswith(".c") or x.endswith(".java") or x.endswith(".zip") - - for link in page.findAll(name="a", attrs={"href": lambda x: x and is_relevant_url(x)}): - href: str = link.attrs.get("href") - name = href.split("/")[-1] - - modification_date: Optional[datetime.datetime] = None - try: - enclosing_row: bs4.Tag = link.findParent(name="tr") - if enclosing_row: - date_text = enclosing_row.find(name="td").text - modification_date = datetime.datetime.strptime(date_text, "%d.%m.%Y") - except ValueError: - modification_date = None - - items.append(IpdDownloadInfo( - Path(name), - url=self._abs_url_from_link(link), - modification_date=modification_date - )) - - return items - - -class IpdDownloader: - """ - A downloader for ipd files. - """ - - def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: IpdDownloadStrategy): - self._tmp_dir = tmp_dir - self._organizer = organizer - self._strategy = strategy - self._session = requests.session() - - def download_all(self, infos: List[IpdDownloadInfo]) -> None: - """ - Download multiple files one after the other. - """ - for info in infos: - self.download(info) - - def download(self, info: IpdDownloadInfo) -> None: - """ - Download a single file. - """ - if not self._strategy(self._organizer, info): - self._organizer.mark(info.path) - return - - with self._session.get(info.url, stream=True) as response: - if response.status_code == 200: - tmp_file = self._tmp_dir.new_path() - stream_to_path(response, tmp_file, info.path.name) - dst_path = self._organizer.accept_file(tmp_file, info.path) - - if dst_path and info.modification_date: - os.utime( - dst_path, - times=( - math.ceil(info.modification_date.timestamp()), - math.ceil(info.modification_date.timestamp()) - ) - ) - - elif response.status_code == 403: - raise FatalException("Received 403. 
Are you not using the KIT VPN?") - else: - PRETTY.warning(f"Could not download file, got response {response.status_code}") diff --git a/PFERD/location.py b/PFERD/location.py deleted file mode 100644 index 7f4c8ca..0000000 --- a/PFERD/location.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Contains a Location class for objects with an inherent path. -""" - -from pathlib import Path, PurePath - - -class ResolveException(Exception): - """An exception while resolving a file.""" - # TODO take care of this when doing exception handling - - -class Location: - """ - An object that has an inherent path. - """ - - def __init__(self, path: Path): - self._path = path.resolve() - - @property - def path(self) -> Path: - """ - This object's location. - """ - - return self._path - - def resolve(self, target: PurePath) -> Path: - """ - Resolve a file relative to the path of this location. - - Raises a [ResolveException] if the file is outside the given directory. - """ - absolute_path = self.path.joinpath(target).resolve() - - # TODO Make this less inefficient - if self.path not in absolute_path.parents: - raise ResolveException(f"Path {target} is not inside directory {self.path}") - - return absolute_path diff --git a/PFERD/logging.py b/PFERD/logging.py deleted file mode 100644 index c25019e..0000000 --- a/PFERD/logging.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -Contains a few logger utility functions and implementations. -""" - -import logging -from typing import Optional - -from rich._log_render import LogRender -from rich.console import Console -from rich.style import Style -from rich.text import Text -from rich.theme import Theme - -from .download_summary import DownloadSummary -from .utils import PathLike, to_path - -STYLE = "{" -FORMAT = "[{levelname:<7}] {message}" -DATE_FORMAT = "%F %T" - - -def enable_logging(name: str = "PFERD", level: int = logging.INFO) -> None: - """ - Enable and configure logging via the logging module. 
- """ - - logger = logging.getLogger(name) - logger.setLevel(level) - logger.addHandler(RichLoggingHandler(level=level)) - - # This should be logged by our own handler, and not the root logger's - # default handler, so we don't pass it on to the root logger. - logger.propagate = False - - -class RichLoggingHandler(logging.Handler): - """ - A logging handler that uses rich for highlighting - """ - - def __init__(self, level: int) -> None: - super().__init__(level=level) - self.console = Console(theme=Theme({ - "logging.level.warning": Style(color="yellow") - })) - self._log_render = LogRender(show_level=True, show_time=False, show_path=False) - - def emit(self, record: logging.LogRecord) -> None: - """ - Invoked by logging. - """ - log_style = f"logging.level.{record.levelname.lower()}" - message = self.format(record) - - level = Text() - level.append(record.levelname, log_style) - message_text = Text.from_markup(message) - - self.console.print( - self._log_render( - self.console, - [message_text], - level=level, - ) - ) - - -class PrettyLogger: - """ - A logger that prints some specially formatted log messages in color. - """ - - def __init__(self, logger: logging.Logger) -> None: - self.logger = logger - - @staticmethod - def _format_path(path: PathLike) -> str: - return repr(str(to_path(path))) - - def error(self, message: str) -> None: - """ - Print an error message indicating some operation fatally failed. - """ - self.logger.error( - f"[bold red]{message}[/bold red]" - ) - - def warning(self, message: str) -> None: - """ - Print a warning message indicating some operation failed, but the error can be recovered - or ignored. - """ - self.logger.warning( - f"[bold yellow]{message}[/bold yellow]" - ) - - def modified_file(self, path: PathLike) -> None: - """ - An existing file has changed. 
- """ - - self.logger.info( - f"[bold magenta]Modified {self._format_path(path)}.[/bold magenta]" - ) - - def new_file(self, path: PathLike) -> None: - """ - A new file has been downloaded. - """ - - self.logger.info( - f"[bold green]Created {self._format_path(path)}.[/bold green]" - ) - - def deleted_file(self, path: PathLike) -> None: - """ - A file has been deleted. - """ - - self.logger.info( - f"[bold red]Deleted {self._format_path(path)}.[/bold red]" - ) - - def ignored_file(self, path: PathLike, reason: str) -> None: - """ - File was not downloaded or modified. - """ - - self.logger.info( - f"[dim]Ignored {self._format_path(path)} " - f"([/dim]{reason}[dim]).[/dim]" - ) - - def searching(self, path: PathLike) -> None: - """ - A crawler searches a particular object. - """ - - self.logger.info(f"Searching {self._format_path(path)}") - - def not_searching(self, path: PathLike, reason: str) -> None: - """ - A crawler does not search a particular object. - """ - - self.logger.info( - f"[dim]Not searching {self._format_path(path)} " - f"([/dim]{reason}[dim]).[/dim]" - ) - - def summary(self, download_summary: DownloadSummary) -> None: - """ - Prints a download summary. - """ - self.logger.info("") - self.logger.info("[bold cyan]Download Summary[/bold cyan]") - if not download_summary.has_updates(): - self.logger.info("[bold dim]Nothing changed![/bold dim]") - return - - for new_file in download_summary.new_files: - self.new_file(new_file) - for modified_file in download_summary.modified_files: - self.modified_file(modified_file) - for deleted_files in download_summary.deleted_files: - self.deleted_file(deleted_files) - - def starting_synchronizer( - self, - target_directory: PathLike, - synchronizer_name: str, - subject: Optional[str] = None, - ) -> None: - """ - A special message marking that a synchronizer has been started. 
- """ - - subject_str = f"{subject} " if subject else "" - self.logger.info("") - self.logger.info(( - f"[bold cyan]Synchronizing " - f"{subject_str}to {self._format_path(target_directory)} " - f"using the {synchronizer_name} synchronizer.[/bold cyan]" - )) diff --git a/PFERD/organizer.py b/PFERD/organizer.py deleted file mode 100644 index fe5052b..0000000 --- a/PFERD/organizer.py +++ /dev/null @@ -1,224 +0,0 @@ -"""A simple helper for managing downloaded files. - -A organizer is bound to a single directory. -""" - -import filecmp -import logging -import os -import shutil -from enum import Enum -from pathlib import Path, PurePath -from typing import Callable, List, Optional, Set - -from .download_summary import DownloadSummary -from .location import Location -from .logging import PrettyLogger -from .utils import prompt_yes_no - -LOGGER = logging.getLogger(__name__) -PRETTY = PrettyLogger(LOGGER) - - -class ConflictType(Enum): - """ - The type of the conflict. A file might not exist anymore and will be deleted - or it might be overwritten with a newer version. 
- - FILE_OVERWRITTEN: An existing file will be updated - MARKED_FILE_OVERWRITTEN: A file is written for the second+ time in this run - FILE_DELETED: The file was deleted - """ - FILE_OVERWRITTEN = "overwritten" - MARKED_FILE_OVERWRITTEN = "marked_file_overwritten" - FILE_DELETED = "deleted" - - -class FileConflictResolution(Enum): - """ - The reaction when confronted with a file conflict: - - DESTROY_EXISTING: Delete/overwrite the current file - KEEP_EXISTING: Keep the current file - DEFAULT: Do whatever the PFERD authors thought is sensible - PROMPT: Interactively ask the user - """ - - DESTROY_EXISTING = "destroy" - - KEEP_EXISTING = "keep" - - DEFAULT = "default" - - PROMPT = "prompt" - - -FileConflictResolver = Callable[[PurePath, ConflictType], FileConflictResolution] - - -def resolve_prompt_user(_path: PurePath, conflict: ConflictType) -> FileConflictResolution: - """ - Resolves conflicts by asking the user if a file was written twice or will be deleted. - """ - if conflict == ConflictType.FILE_OVERWRITTEN: - return FileConflictResolution.DESTROY_EXISTING - return FileConflictResolution.PROMPT - - -class FileAcceptException(Exception): - """An exception while accepting a file.""" - - -class Organizer(Location): - """A helper for managing downloaded files.""" - - def __init__(self, path: Path, conflict_resolver: FileConflictResolver = resolve_prompt_user): - """Create a new organizer for a given path.""" - super().__init__(path) - self._known_files: Set[Path] = set() - - # Keep the root dir - self._known_files.add(path.resolve()) - - self.download_summary = DownloadSummary() - - self.conflict_resolver = conflict_resolver - - def accept_file(self, src: Path, dst: PurePath) -> Optional[Path]: - """ - Move a file to this organizer and mark it. - - Returns the path the file was moved to, to allow the caller to adjust the metadata. - As you might still need to adjust the metadata when the file was identical - (e.g. 
update the timestamp), the path is also returned in this case. - In all other cases (ignored, not overwritten, etc.) this method returns None. - """ - # Windows limits the path length to 260 for *some* historical reason - # If you want longer paths, you will have to add the "\\?\" prefix in front of - # your path... - # See: - # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation - if os.name == 'nt': - src_absolute = Path("\\\\?\\" + str(src.resolve())) - dst_absolute = Path("\\\\?\\" + str(self.resolve(dst))) - else: - src_absolute = src.resolve() - dst_absolute = self.resolve(dst) - - if not src_absolute.exists(): - raise FileAcceptException("Source file does not exist") - - if not src_absolute.is_file(): - raise FileAcceptException("Source is a directory") - - LOGGER.debug("Copying %s to %s", src_absolute, dst_absolute) - - if self._is_marked(dst): - PRETTY.warning(f"File {str(dst_absolute)!r} was already written!") - conflict = ConflictType.MARKED_FILE_OVERWRITTEN - if self._resolve_conflict("Overwrite file?", dst_absolute, conflict, default=False): - PRETTY.ignored_file(dst_absolute, "file was written previously") - return None - - # Destination file is directory - if dst_absolute.exists() and dst_absolute.is_dir(): - prompt = f"Overwrite folder {dst_absolute} with file?" - conflict = ConflictType.FILE_OVERWRITTEN - if self._resolve_conflict(prompt, dst_absolute, conflict, default=False): - shutil.rmtree(dst_absolute) - else: - PRETTY.warning(f"Could not add file {str(dst_absolute)!r}") - return None - - # Destination file exists - if dst_absolute.exists() and dst_absolute.is_file(): - if filecmp.cmp(str(src_absolute), str(dst_absolute), shallow=False): - # Bail out, nothing more to do - PRETTY.ignored_file(dst_absolute, "same file contents") - self.mark(dst) - return dst_absolute - - prompt = f"Overwrite file {dst_absolute}?" 
- conflict = ConflictType.FILE_OVERWRITTEN - if not self._resolve_conflict(prompt, dst_absolute, conflict, default=True): - PRETTY.ignored_file(dst_absolute, "user conflict resolution") - return None - - self.download_summary.add_modified_file(dst_absolute) - PRETTY.modified_file(dst_absolute) - else: - self.download_summary.add_new_file(dst_absolute) - PRETTY.new_file(dst_absolute) - - # Create parent dir if needed - dst_parent_dir: Path = dst_absolute.parent - dst_parent_dir.mkdir(exist_ok=True, parents=True) - - # Move file - shutil.move(str(src_absolute), str(dst_absolute)) - - self.mark(dst) - - return dst_absolute - - def mark(self, path: PurePath) -> None: - """Mark a file as used so it will not get cleaned up.""" - absolute_path = self.resolve(path) - self._known_files.add(absolute_path) - LOGGER.debug("Tracked %s", absolute_path) - - def _is_marked(self, path: PurePath) -> bool: - """ - Checks whether a file is marked. - """ - absolute_path = self.resolve(path) - return absolute_path in self._known_files - - def cleanup(self) -> None: - """Remove all untracked files in the organizer's dir.""" - LOGGER.debug("Deleting all untracked files...") - - self._cleanup(self.path) - - def _cleanup(self, start_dir: Path) -> None: - if not start_dir.exists(): - return - paths: List[Path] = list(start_dir.iterdir()) - - # Recursively clean paths - for path in paths: - if path.is_dir(): - self._cleanup(path) - else: - if path.resolve() not in self._known_files: - self._delete_file_if_confirmed(path) - - # Delete dir if it was empty and untracked - dir_empty = len(list(start_dir.iterdir())) == 0 - if start_dir.resolve() not in self._known_files and dir_empty: - start_dir.rmdir() - - def _delete_file_if_confirmed(self, path: Path) -> None: - prompt = f"Do you want to delete {path}" - - if self._resolve_conflict(prompt, path, ConflictType.FILE_DELETED, default=False): - self.download_summary.add_deleted_file(path) - path.unlink() - else: - PRETTY.ignored_file(path, "user 
conflict resolution") - - def _resolve_conflict( - self, prompt: str, path: Path, conflict: ConflictType, default: bool - ) -> bool: - if not self.conflict_resolver: - return prompt_yes_no(prompt, default=default) - - result = self.conflict_resolver(path, conflict) - if result == FileConflictResolution.DEFAULT: - return default - if result == FileConflictResolution.KEEP_EXISTING: - return False - if result == FileConflictResolution.DESTROY_EXISTING: - return True - - return prompt_yes_no(prompt, default=default) diff --git a/PFERD/progress.py b/PFERD/progress.py deleted file mode 100644 index 6ad098f..0000000 --- a/PFERD/progress.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -A small progress bar implementation. -""" -import sys -from dataclasses import dataclass -from types import TracebackType -from typing import Optional, Type - -import requests -from rich.console import Console -from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, - TextColumn, TimeRemainingColumn, - TransferSpeedColumn) - -_progress: Progress = Progress( - TextColumn("[bold blue]{task.fields[name]}", justify="right"), - BarColumn(bar_width=None), - "[progress.percentage]{task.percentage:>3.1f}%", - "•", - DownloadColumn(), - "•", - TransferSpeedColumn(), - "•", - TimeRemainingColumn(), - console=Console(file=sys.stdout), - transient=True -) - - -def size_from_headers(response: requests.Response) -> Optional[int]: - """ - Return the size of the download based on the response headers. - - Arguments: - response {requests.Response} -- the response - - Returns: - Optional[int] -- the size - """ - if "Content-Length" in response.headers: - return int(response.headers["Content-Length"]) - return None - - -@dataclass -class ProgressSettings: - """ - Settings you can pass to customize the progress bar. 
- """ - name: str - max_size: int - - -def progress_for(settings: Optional[ProgressSettings]) -> 'ProgressContextManager': - """ - Returns a context manager that displays progress - - Returns: - ProgressContextManager -- the progress manager - """ - return ProgressContextManager(settings) - - -class ProgressContextManager: - """ - A context manager used for displaying progress. - """ - - def __init__(self, settings: Optional[ProgressSettings]): - self._settings = settings - self._task_id: Optional[TaskID] = None - - def __enter__(self) -> 'ProgressContextManager': - """Context manager entry function.""" - if not self._settings: - return self - - _progress.start() - self._task_id = _progress.add_task( - self._settings.name, - total=self._settings.max_size, - name=self._settings.name - ) - return self - - # pylint: disable=useless-return - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_value: Optional[BaseException], - traceback: Optional[TracebackType], - ) -> Optional[bool]: - """Context manager exit function. Removes the task.""" - if self._task_id is None: - return None - - _progress.remove_task(self._task_id) - - if len(_progress.task_ids) == 0: - # We need to clean up after ourselves, as we were the last one - _progress.stop() - _progress.refresh() - - return None - - def advance(self, amount: float) -> None: - """ - Advances the progress bar. 
- """ - if self._task_id is not None: - _progress.advance(self._task_id, amount) diff --git a/PFERD/tmp_dir.py b/PFERD/tmp_dir.py deleted file mode 100644 index 51ade2d..0000000 --- a/PFERD/tmp_dir.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Helper functions and classes for temporary folders.""" - -import logging -import shutil -from pathlib import Path -from types import TracebackType -from typing import Optional, Type - -from .location import Location - -LOGGER = logging.getLogger(__name__) - - -class TmpDir(Location): - """A temporary folder that can create files or nested temp folders.""" - - def __init__(self, path: Path): - """Create a new temporary folder for the given path.""" - super().__init__(path) - self._counter = 0 - self.cleanup() - self.path.mkdir(parents=True, exist_ok=True) - - def __str__(self) -> str: - """Format the folder as a string.""" - return f"Folder at {self.path}" - - def __enter__(self) -> 'TmpDir': - """Context manager entry function.""" - return self - - # pylint: disable=useless-return - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_value: Optional[BaseException], - traceback: Optional[TracebackType], - ) -> Optional[bool]: - """Context manager exit function. Calls cleanup().""" - self.cleanup() - return None - - def new_path(self, prefix: Optional[str] = None) -> Path: - """ - Return a unique path inside the directory. Doesn't create a file or - directory. - """ - - name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}" - - LOGGER.debug("Creating temp file %s", name) - - return self.resolve(Path(name)) - - def new_subdir(self, prefix: Optional[str] = None) -> 'TmpDir': - """ - Create a new nested temporary folder and return it. 
- """ - - name = f"{prefix if prefix else 'tmp'}-{self._inc_and_get_counter():03}" - sub_path = self.resolve(Path(name)) - sub_path.mkdir(parents=True) - - LOGGER.debug("Creating temp dir %s at %s", name, sub_path) - - return TmpDir(sub_path) - - def cleanup(self) -> None: - """Delete this folder and all contained files.""" - LOGGER.debug("Deleting temp folder %s", self.path) - - if self.path.resolve().exists(): - shutil.rmtree(self.path.resolve()) - - def _inc_and_get_counter(self) -> int: - """Get and increment the counter by one.""" - counter = self._counter - self._counter += 1 - return counter diff --git a/PFERD/transform.py b/PFERD/transform.py deleted file mode 100644 index a2152ba..0000000 --- a/PFERD/transform.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Transforms let the user define functions to decide where the downloaded files -should be placed locally. They let the user do more advanced things like moving -only files whose names match a regex, or renaming files from one numbering -scheme to another. -""" - -import os -import re -from dataclasses import dataclass -from pathlib import PurePath -from typing import Callable, List, Optional, TypeVar - -from .utils import PathLike, Regex, to_path, to_pattern - -Transform = Callable[[PurePath], Optional[PurePath]] - - -@dataclass -class Transformable: - """ - An object that can be transformed by a Transform. - """ - - path: PurePath - - -TF = TypeVar("TF", bound=Transformable) - - -def apply_transform( - transform: Transform, - transformables: List[TF], -) -> List[TF]: - """ - Apply a Transform to multiple Transformables, discarding those that were - not transformed by the Transform. 
- """ - - result: List[TF] = [] - for transformable in transformables: - new_path = transform(transformable.path) - if new_path: - transformable.path = new_path - result.append(transformable) - return result - -# Transform combinators - -def keep(path: PurePath) -> Optional[PurePath]: - return path - -def attempt(*args: Transform) -> Transform: - def inner(path: PurePath) -> Optional[PurePath]: - for transform in args: - result = transform(path) - if result: - return result - return None - return inner - -def optionally(transform: Transform) -> Transform: - return attempt(transform, lambda path: path) - -def do(*args: Transform) -> Transform: - def inner(path: PurePath) -> Optional[PurePath]: - current = path - for transform in args: - result = transform(current) - if result: - current = result - else: - return None - return current - return inner - -def predicate(pred: Callable[[PurePath], bool]) -> Transform: - def inner(path: PurePath) -> Optional[PurePath]: - if pred(path): - return path - return None - return inner - -def glob(pattern: str) -> Transform: - return predicate(lambda path: path.match(pattern)) - -def move_dir(source_dir: PathLike, target_dir: PathLike) -> Transform: - source_path = to_path(source_dir) - target_path = to_path(target_dir) - def inner(path: PurePath) -> Optional[PurePath]: - if source_path in path.parents: - return target_path / path.relative_to(source_path) - return None - return inner - -def move(source: PathLike, target: PathLike) -> Transform: - source_path = to_path(source) - target_path = to_path(target) - def inner(path: PurePath) -> Optional[PurePath]: - if path == source_path: - return target_path - return None - return inner - -def rename(source: str, target: str) -> Transform: - def inner(path: PurePath) -> Optional[PurePath]: - if path.name == source: - return path.with_name(target) - return None - return inner - -def re_move(regex: Regex, target: str) -> Transform: - def inner(path: PurePath) -> Optional[PurePath]: - 
match = to_pattern(regex).fullmatch(str(path)) - if match: - groups = [match.group(0)] - groups.extend(match.groups()) - return PurePath(target.format(*groups)) - return None - return inner - -def re_rename(regex: Regex, target: str) -> Transform: - def inner(path: PurePath) -> Optional[PurePath]: - match = to_pattern(regex).fullmatch(path.name) - if match: - groups = [match.group(0)] - groups.extend(match.groups()) - return path.with_name(target.format(*groups)) - return None - return inner - - -def sanitize_windows_path(path: PurePath) -> PurePath: - """ - A small function to escape characters that are forbidden in windows path names. - This method is a no-op on other operating systems. - """ - # Escape windows illegal path characters - if os.name == 'nt': - sanitized_parts = [re.sub(r'[<>:"/|?]', "_", x) for x in list(path.parts)] - return PurePath(*sanitized_parts) - return path From 0c9167512c7345a54c60f493fd574a56c43800e1 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 14 May 2021 21:28:38 +0200 Subject: [PATCH 108/524] Fix output dir I missed these while renaming the resolve function. Shame on me for not running mypy earlier. 
--- PFERD/output_dir.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 1be9a16..89c5839 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -86,7 +86,7 @@ class OutputDirectory: self._report = Report() - def register_reserved(self, path: PurePath): + def register_reserved(self, path: PurePath) -> None: self._report.mark_reserved(path) def _mark(self, path: PurePath) -> None: @@ -265,7 +265,7 @@ class OutputDirectory: heuristics = Heuristics(mtime) redownload = self._redownload if redownload is None else redownload on_conflict = self._on_conflict if on_conflict is None else on_conflict - local_path = self._resolve(path) + local_path = self.resolve(path) self._mark(path) @@ -281,7 +281,7 @@ class OutputDirectory: # Detect and solve local-file-remote-dir conflict for parent in path.parents: - local_parent = self._resolve(parent) + local_parent = self.resolve(parent) if local_parent.exists() and not local_parent.is_dir(): if await self._conflict_lfrd(on_conflict, path, parent): local_parent.unlink() From 1591cb9197e3e5e6b8b11a572543aa231d8a2653 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 14 May 2021 21:41:24 +0200 Subject: [PATCH 109/524] Add options to slow down local crawler These options are meant to make the local crawler behave more like a network-based crawler for purposes of testing and debugging other parts of the code base. --- CONFIG.md | 12 ++++++-- PFERD/conductor.py | 1 + PFERD/crawlers/local.py | 61 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 65 insertions(+), 9 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 92c36ae..2cac906 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -102,9 +102,15 @@ authenticators is `type`: ### The `local` crawler This crawler crawls a local directory. It is really simple and mostly useful for -testing different setups. +testing different setups. 
The various delay options are meant to make the +crawler simulate a slower, network-based crawler. - `path`: Path to the local directory to crawl. (Required) +- `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl + requests. (Optional) +- `download_delay`: Maximum artificial delay (in seconds) to simulate for + download requests. (Optional) +- `download_speed`: Download speed (in bytes per second) to simulate. (Optional) ## Authenticator types @@ -114,8 +120,8 @@ With this authenticator, the username and password can be set directly in the config file. If the username or password are not specified, the user is prompted via the terminal. -- `username`: The username (Optional) -- `password`: The password (Optional) +- `username`: The username. (Optional) +- `password`: The password. (Optional) ## Transformation rules diff --git a/PFERD/conductor.py b/PFERD/conductor.py index 5022a22..d50574e 100644 --- a/PFERD/conductor.py +++ b/PFERD/conductor.py @@ -17,6 +17,7 @@ class ProgressBar: def set_total(self, total: float) -> None: self._progress.update(self._taskid, total=total) + self._progress.start_task(self._taskid) class TerminalConductor: diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index fb08cc9..1677ff0 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -1,6 +1,8 @@ import asyncio import datetime +import random from pathlib import Path, PurePath +from typing import Optional from ..conductor import TerminalConductor from ..config import Config @@ -14,6 +16,24 @@ class LocalCrawlerSection(CrawlerSection): self.missing_value("path") return Path(value).expanduser() + def crawl_delay(self) -> Optional[float]: + value = self.s.getfloat("crawl_delay") + if value <= 0: + self.invalid_value("crawl_delay", value) + return value + + def download_delay(self) -> Optional[float]: + value = self.s.getfloat("download_delay") + if value <= 0: + self.invalid_value("download_delay", value) + return value + + def 
download_speed(self) -> Optional[int]: + value = self.s.getint("download_speed") + if value <= 0: + self.invalid_value("download_speed", value) + return value + class LocalCrawler(Crawler): def __init__( @@ -26,6 +46,14 @@ class LocalCrawler(Crawler): super().__init__(name, section, config, conductor) self._path = config.working_dir / section.path() + self._crawl_delay = section.crawl_delay() + self._download_delay = section.download_delay() + self._download_speed = section.download_speed() + + if self._download_speed: + self._block_size = self._download_speed // 10 + else: + self._block_size = 1024**2 # 1 MiB async def crawl(self) -> None: await self._crawl_path(self._path, PurePath()) @@ -41,28 +69,49 @@ class LocalCrawler(Crawler): async def _crawl_dir(self, path: Path, pure: PurePath) -> None: tasks = [] + async with self.crawl_bar(pure): + if self._crawl_delay: + await asyncio.sleep(random.uniform( + 0.5 * self._crawl_delay, + self._crawl_delay, + )) + for child in path.iterdir(): pure_child = pure / child.name tasks.append(self._crawl_path(child, pure_child)) + await asyncio.gather(*tasks) async def _crawl_file(self, path: Path, pure: PurePath) -> None: + stat = path.stat() + mtime = datetime.datetime.fromtimestamp(stat.st_mtime) + dl = await self.download(pure, mtime=mtime) + if not dl: + return + async with self.download_bar(path) as bar: - stat = path.stat() - mtime = datetime.datetime.fromtimestamp(stat.st_mtime) - dl = await self.download(pure, mtime=mtime) - if not dl: - return + if self._download_delay: + await asyncio.sleep(random.uniform( + 0.5 * self._download_delay, + self._download_delay, + )) bar.set_total(stat.st_size) async with dl as sink: with open(path, "rb") as f: while True: - data = f.read(1024**2) + data = f.read(self._block_size) if len(data) == 0: break + sink.file.write(data) bar.advance(len(data)) + + if self._download_speed: + delay = self._block_size / self._download_speed + delay = random.uniform(0.8 * delay, 1.2 * delay) + await 
asyncio.sleep(delay) + sink.done() From 296a169dd30e68a679624b2a53ef516281a51a0d Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 00:38:46 +0200 Subject: [PATCH 110/524] Make limiter logic more complex The limiter can now distinguish between crawl and download actions and has a fancy slot system and delay logic. --- CONFIG.md | 11 ++++++++ PFERD/config.py | 12 +++++++-- PFERD/crawler.py | 65 +++++++++++++++++++++++++++++++++--------------- PFERD/limiter.py | 65 ++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 126 insertions(+), 27 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 2cac906..a74eef3 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -64,6 +64,17 @@ crawlers: remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) +- `max_concurrent_crawls`: The maximum number of concurrent crawl actions. What + constitutes a crawl action might vary from crawler to crawler, but it usually + means an HTTP request of a page to analyze. (Default: 1) +- `max_concurrent_downloads`: The maximum number of concurrent download actions. + What constitutes a download action might vary from crawler to crawler, but it + usually means an HTTP request for a single file. (Default: 1) +- `request_delay`: Time (in seconds) that the crawler should wait between + subsequent requests. Can be used to avoid unnecessary strain for the crawl + target. Crawl and download actions are handled separately, meaning that a + download action might immediately follow a crawl action even if this is set to + a nonzero value. (Default: 0) Some crawlers may also require credentials for authentication. To configure how the crawler obtains its credentials, the `auth` option is used. 
It is set to the diff --git a/PFERD/config.py b/PFERD/config.py index 56ea9af..0520f74 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -33,8 +33,16 @@ class Section: def error(self, key: str, desc: str) -> NoReturn: raise ConfigFormatException(self.s.name, key, desc) - def invalid_value(self, key: str, value: Any) -> NoReturn: - self.error(key, f"Invalid value: {value!r}") + def invalid_value( + self, + key: str, + value: Any, + reason: Optional[str], + ) -> NoReturn: + if reason is None: + self.error(key, f"Invalid value {value!r}") + else: + self.error(key, f"Invalid value {value!r}: {reason}") def missing_value(self, key: str) -> NoReturn: self.error(key, "Missing value") diff --git a/PFERD/crawler.py b/PFERD/crawler.py index ece62c1..f506294 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -139,6 +139,28 @@ class CrawlerSection(Section): def transform(self) -> str: return self.s.get("transform", "") + def max_concurrent_crawls(self) -> int: + value = self.s.getint("max_concurrent_crawls", fallback=1) + if value <= 0: + self.invalid_value("max_concurrent_crawls", value, + "Must be greater than 0") + return value + + def max_concurrent_downloads(self) -> int: + value = self.s.getint("max_concurrent_downloads", fallback=1) + + if value <= 0: + self.invalid_value("max_concurrent_downloads", value, + "Must be greater than 0") + return value + + def request_delay(self) -> float: + value = self.s.getfloat("request_delay", fallback=0.0) + if value < 0: + self.invalid_value("request_delay", value, + "Must be greater than or equal to 0") + return value + def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator: value = self.s.get("auth") if value is None: @@ -168,9 +190,14 @@ class Crawler(ABC): self.name = name self._conductor = conductor - self._limiter = Limiter() self.error_free = True + self._limiter = Limiter( + crawl_limit=section.max_concurrent_crawls(), + download_limit=section.max_concurrent_downloads(), + 
delay=section.request_delay(), + ) + try: self._transformer = Transformer(section.transform()) except RuleParseException as e: @@ -210,28 +237,26 @@ class Crawler(ABC): return self._conductor.exclusive_output() @asynccontextmanager - async def progress_bar( - self, - desc: str, - total: Optional[int] = None, - ) -> AsyncIterator[ProgressBar]: - async with self._limiter.limit(): - with self._conductor.progress_bar(desc, total=total) as bar: - yield bar - - def crawl_bar(self, path: PurePath) -> AsyncContextManager[ProgressBar]: - pathstr = escape(str(path)) - desc = f"[bold magenta]Crawling[/bold magenta] {pathstr}" - return self.progress_bar(desc) - - def download_bar( + async def crawl_bar( self, path: PurePath, total: Optional[int] = None, - ) -> AsyncContextManager[ProgressBar]: - pathstr = escape(str(path)) - desc = f"[bold green]Downloading[/bold green] {pathstr}" - return self.progress_bar(desc, total=total) + ) -> AsyncIterator[ProgressBar]: + desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}" + async with self._limiter.limit_crawl(): + with self._conductor.progress_bar(desc, total=total) as bar: + yield bar + + @asynccontextmanager + async def download_bar( + self, + path: PurePath, + total: Optional[int] = None, + ) -> AsyncIterator[ProgressBar]: + desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}" + async with self._limiter.limit_download(): + with self._conductor.progress_bar(desc, total=total) as bar: + yield bar async def download( self, diff --git a/PFERD/limiter.py b/PFERD/limiter.py index ae72fe6..6359221 100644 --- a/PFERD/limiter.py +++ b/PFERD/limiter.py @@ -1,13 +1,68 @@ import asyncio +import time from contextlib import asynccontextmanager -from typing import AsyncIterator +from dataclasses import dataclass +from typing import AsyncContextManager, AsyncIterator, Optional -class Limiter: - def __init__(self, limit: int = 10): - self._semaphore = asyncio.Semaphore(limit) +@dataclass +class Slot: + active: bool = False + 
last_left: Optional[float] = None + + +class SlotPool: + def __init__(self, limit: int, delay: float): + if limit <= 0: + raise ValueError("limit must be greater than 0") + + self._slots = [Slot() for _ in range(limit)] + self._delay = delay + + self._free = asyncio.Condition() + + def _acquire_slot(self) -> Optional[Slot]: + for slot in self._slots: + if not slot.active: + slot.active = True + return slot + + return None + + def _release_slot(self, slot: Slot) -> None: + slot.last_left = time.time() + slot.active = False @asynccontextmanager async def limit(self) -> AsyncIterator[None]: - async with self._semaphore: + slot: Slot + async with self._free: + while True: + if found_slot := self._acquire_slot(): + slot = found_slot + break + await self._free.wait() + + if slot.last_left is not None: + delay = slot.last_left + self._delay - time.time() + if delay > 0: + await asyncio.sleep(delay) + + try: yield + finally: + async with self._free: + self._release_slot(slot) + self._free.notify() + + +class Limiter: + def __init__(self, crawl_limit: int, download_limit: int, delay: float): + self._crawl_pool = SlotPool(crawl_limit, delay) + self._download_pool = SlotPool(download_limit, delay) + + def limit_crawl(self) -> AsyncContextManager[None]: + return self._crawl_pool.limit() + + def limit_download(self) -> AsyncContextManager[None]: + return self._crawl_pool.limit() From ed2e19a150004fa61544528195dfc4acf9b70ec2 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 00:39:55 +0200 Subject: [PATCH 111/524] Add reasons for invalid values --- PFERD/crawler.py | 16 +++++++++++++--- PFERD/crawlers/local.py | 9 ++++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index f506294..48dfcb4 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -122,7 +122,12 @@ class CrawlerSection(Section): return Redownload.ALWAYS elif value == "always-smart": return Redownload.ALWAYS_SMART - self.invalid_value("redownload", 
value) + + self.invalid_value( + "redownload", + value, + "Must be 'never', 'never-smart', 'always' or 'always-smart'" + ) def on_conflict(self) -> OnConflict: value = self.s.get("on_conflict", "prompt") @@ -134,7 +139,12 @@ class CrawlerSection(Section): return OnConflict.REMOTE_FIRST elif value == "no-delete": return OnConflict.NO_DELETE - self.invalid_value("on_conflict", value) + + self.invalid_value( + "on_conflict", + value, + "Must be 'prompt', 'local-first', 'remote-first' or 'no-delete'", + ) def transform(self) -> str: return self.s.get("transform", "") @@ -167,7 +177,7 @@ class CrawlerSection(Section): self.missing_value("auth") auth = authenticators.get(f"auth:{value}") if auth is None: - self.invalid_value("auth", value) + self.invalid_value("auth", value, "No such auth section exists") return auth diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 1677ff0..07e6133 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -19,19 +19,22 @@ class LocalCrawlerSection(CrawlerSection): def crawl_delay(self) -> Optional[float]: value = self.s.getfloat("crawl_delay") if value <= 0: - self.invalid_value("crawl_delay", value) + self.invalid_value("crawl_delay", value, + "Must be greater than 0") return value def download_delay(self) -> Optional[float]: value = self.s.getfloat("download_delay") if value <= 0: - self.invalid_value("download_delay", value) + self.invalid_value("download_delay", value, + "Must be greater than 0") return value def download_speed(self) -> Optional[int]: value = self.s.getint("download_speed") if value <= 0: - self.invalid_value("download_speed", value) + self.invalid_value("download_speed", value, + "Must be greater than 0") return value From b0f9e1e8b4fb22f7bbe0b5f1839bd405651d9eb1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 15 May 2021 11:20:20 +0200 Subject: [PATCH 112/524] Add vscode directory to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore 
b/.gitignore index c888722..2928b54 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ /.venv/ /PFERD.egg-info/ __pycache__/ +/.vscode/ From acd674f0a076fba8bfaf64b90bfc3000d3f5cb73 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 13:21:38 +0200 Subject: [PATCH 113/524] Change limiter logic Now download tasks are a subset of all tasks. --- CONFIG.md | 19 +++++----- PFERD/crawler.py | 27 ++++++++------ PFERD/limiter.py | 93 +++++++++++++++++++++++++++++++----------------- 3 files changed, 85 insertions(+), 54 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index a74eef3..2338d8f 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -64,17 +64,14 @@ crawlers: remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) -- `max_concurrent_crawls`: The maximum number of concurrent crawl actions. What - constitutes a crawl action might vary from crawler to crawler, but it usually - means an HTTP request of a page to analyze. (Default: 1) -- `max_concurrent_downloads`: The maximum number of concurrent download actions. - What constitutes a download action might vary from crawler to crawler, but it - usually means an HTTP request for a single file. (Default: 1) -- `request_delay`: Time (in seconds) that the crawler should wait between - subsequent requests. Can be used to avoid unnecessary strain for the crawl - target. Crawl and download actions are handled separately, meaning that a - download action might immediately follow a crawl action even if this is set to - a nonzero value. (Default: 0) +- `max_concurrent_tasks`: The maximum number of concurrent tasks (such as + crawling or downloading). (Default: 1) +- `max_concurrent_downloads`: How many of those tasks can be download tasks at + the same time. Must not be greater than `max_concurrent_tasks`. When not set, + this is the same as `max_concurrent_tasks`. 
(Optional) +- `delay_between_tasks`: Time (in seconds) that the crawler should wait between + subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary + load for the crawl target. (Default: 0.0) Some crawlers may also require credentials for authentication. To configure how the crawler obtains its credentials, the `auth` option is used. It is set to the diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 48dfcb4..9ec5991 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -149,26 +149,31 @@ class CrawlerSection(Section): def transform(self) -> str: return self.s.get("transform", "") - def max_concurrent_crawls(self) -> int: - value = self.s.getint("max_concurrent_crawls", fallback=1) + def max_concurrent_tasks(self) -> int: + value = self.s.getint("max_concurrent_tasks", fallback=1) if value <= 0: - self.invalid_value("max_concurrent_crawls", value, + self.invalid_value("max_concurrent_tasks", value, "Must be greater than 0") return value def max_concurrent_downloads(self) -> int: - value = self.s.getint("max_concurrent_downloads", fallback=1) - + tasks = self.max_concurrent_tasks() + value = self.s.getint("max_concurrent_downloads", fallback=None) + if value is None: + return tasks if value <= 0: self.invalid_value("max_concurrent_downloads", value, "Must be greater than 0") + if value > tasks: + self.invalid_value("max_concurrent_downloads", value, + "Must not be greater than max_concurrent_tasks") return value - def request_delay(self) -> float: - value = self.s.getfloat("request_delay", fallback=0.0) + def delay_between_tasks(self) -> float: + value = self.s.getfloat("delay_between_tasks", fallback=0.0) if value < 0: - self.invalid_value("request_delay", value, - "Must be greater than or equal to 0") + self.invalid_value("delay_between_tasks", value, + "Must not be negative") return value def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator: @@ -203,9 +208,9 @@ class Crawler(ABC): self.error_free = True 
self._limiter = Limiter( - crawl_limit=section.max_concurrent_crawls(), + task_limit=section.max_concurrent_tasks(), download_limit=section.max_concurrent_downloads(), - delay=section.request_delay(), + task_delay=section.delay_between_tasks(), ) try: diff --git a/PFERD/limiter.py b/PFERD/limiter.py index 6359221..3122a7a 100644 --- a/PFERD/limiter.py +++ b/PFERD/limiter.py @@ -2,7 +2,7 @@ import asyncio import time from contextlib import asynccontextmanager from dataclasses import dataclass -from typing import AsyncContextManager, AsyncIterator, Optional +from typing import AsyncIterator, Optional @dataclass @@ -11,15 +11,27 @@ class Slot: last_left: Optional[float] = None -class SlotPool: - def __init__(self, limit: int, delay: float): - if limit <= 0: - raise ValueError("limit must be greater than 0") +class Limiter: + def __init__( + self, + task_limit: int, + download_limit: int, + task_delay: float + ): + if task_limit <= 0: + raise ValueError("task limit must be at least 1") + if download_limit <= 0: + raise ValueError("download limit must be at least 1") + if download_limit > task_limit: + raise ValueError("download limit can't be greater than task limit") + if task_delay < 0: + raise ValueError("Task delay must not be negative") - self._slots = [Slot() for _ in range(limit)] - self._delay = delay + self._slots = [Slot() for _ in range(task_limit)] + self._downloads = download_limit + self._delay = task_delay - self._free = asyncio.Condition() + self._condition = asyncio.Condition() def _acquire_slot(self) -> Optional[Slot]: for slot in self._slots: @@ -29,40 +41,57 @@ class SlotPool: return None - def _release_slot(self, slot: Slot) -> None: - slot.last_left = time.time() - slot.active = False - - @asynccontextmanager - async def limit(self) -> AsyncIterator[None]: - slot: Slot - async with self._free: - while True: - if found_slot := self._acquire_slot(): - slot = found_slot - break - await self._free.wait() - + async def _wait_for_slot_delay(self, slot: 
Slot) -> None: if slot.last_left is not None: delay = slot.last_left + self._delay - time.time() if delay > 0: await asyncio.sleep(delay) + def _release_slot(self, slot: Slot) -> None: + slot.last_left = time.time() + slot.active = False + + @asynccontextmanager + async def limit_crawl(self) -> AsyncIterator[None]: + slot: Slot + async with self._condition: + while True: + if found_slot := self._acquire_slot(): + slot = found_slot + break + await self._condition.wait() + + await self._wait_for_slot_delay(slot) + try: yield finally: - async with self._free: + async with self._condition: self._release_slot(slot) - self._free.notify() + self._condition.notify_all() + @asynccontextmanager + async def limit_download(self) -> AsyncIterator[None]: + slot: Slot + async with self._condition: + while True: + if self._downloads <= 0: + await self._condition.wait() + continue -class Limiter: - def __init__(self, crawl_limit: int, download_limit: int, delay: float): - self._crawl_pool = SlotPool(crawl_limit, delay) - self._download_pool = SlotPool(download_limit, delay) + if found_slot := self._acquire_slot(): + slot = found_slot + self._downloads -= 1 + break - def limit_crawl(self) -> AsyncContextManager[None]: - return self._crawl_pool.limit() + await self._condition.wait() - def limit_download(self) -> AsyncContextManager[None]: - return self._crawl_pool.limit() + await self._wait_for_slot_delay(slot) + + try: + yield + finally: + async with self._condition: + self._release_slot(slot) + self._downloads += 1 + self._condition.notify_all() From 302b8c0c3466a51c29f919d519edf2b0ce8f40e8 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 13:32:13 +0200 Subject: [PATCH 114/524] Fix errors loading local crawler config Apparently getint and getfloat may return a None even though this is not mentioned in their type annotations. 
--- CONFIG.md | 4 ++-- PFERD/crawlers/local.py | 36 +++++++++++++++++------------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 2338d8f..dd38c11 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -115,9 +115,9 @@ crawler simulate a slower, network-based crawler. - `path`: Path to the local directory to crawl. (Required) - `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl - requests. (Optional) + requests. (Default: 0.0) - `download_delay`: Maximum artificial delay (in seconds) to simulate for - download requests. (Optional) + download requests. (Default: 0.0) - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) ## Authenticator types diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 07e6133..99bc700 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -16,23 +16,23 @@ class LocalCrawlerSection(CrawlerSection): self.missing_value("path") return Path(value).expanduser() - def crawl_delay(self) -> Optional[float]: - value = self.s.getfloat("crawl_delay") - if value <= 0: + def crawl_delay(self) -> float: + value = self.s.getfloat("crawl_delay", fallback=0.0) + if value < 0: self.invalid_value("crawl_delay", value, - "Must be greater than 0") + "Must not be negative") return value - def download_delay(self) -> Optional[float]: - value = self.s.getfloat("download_delay") - if value <= 0: + def download_delay(self) -> float: + value = self.s.getfloat("download_delay", fallback=0.0) + if value < 0: self.invalid_value("download_delay", value, - "Must be greater than 0") + "Must not be negative") return value def download_speed(self) -> Optional[int]: value = self.s.getint("download_speed") - if value <= 0: + if value is not None and value <= 0: self.invalid_value("download_speed", value, "Must be greater than 0") return value @@ -74,11 +74,10 @@ class LocalCrawler(Crawler): tasks = [] async with self.crawl_bar(pure): - if self._crawl_delay: - 
await asyncio.sleep(random.uniform( - 0.5 * self._crawl_delay, - self._crawl_delay, - )) + await asyncio.sleep(random.uniform( + 0.5 * self._crawl_delay, + self._crawl_delay, + )) for child in path.iterdir(): pure_child = pure / child.name @@ -94,11 +93,10 @@ class LocalCrawler(Crawler): return async with self.download_bar(path) as bar: - if self._download_delay: - await asyncio.sleep(random.uniform( - 0.5 * self._download_delay, - self._download_delay, - )) + await asyncio.sleep(random.uniform( + 0.5 * self._download_delay, + self._download_delay, + )) bar.set_total(stat.st_size) From b0f731bf84dfd60cc78f08dfbd6ed0992faba3c8 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 14:03:15 +0200 Subject: [PATCH 115/524] Make crawlers use transformers --- PFERD/crawler.py | 9 ++++++++- PFERD/crawlers/local.py | 3 ++- PFERD/transformer.py | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 9ec5991..f8cf091 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -273,6 +273,9 @@ class Crawler(ABC): with self._conductor.progress_bar(desc, total=total) as bar: yield bar + def should_crawl(self, path: PurePath) -> bool: + return self._transformer.transform(path) is not None + async def download( self, path: PurePath, @@ -280,8 +283,12 @@ class Crawler(ABC): redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, ) -> Optional[AsyncContextManager[FileSink]]: + transformed_path = self._transformer.transform(path) + if transformed_path is None: + return None + return await self._output_dir.download( - path, mtime, redownload, on_conflict) + transformed_path, mtime, redownload, on_conflict) async def cleanup(self) -> None: await self._output_dir.cleanup() diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 99bc700..360a9a9 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -81,7 +81,8 @@ class LocalCrawler(Crawler): for child in path.iterdir(): 
pure_child = pure / child.name - tasks.append(self._crawl_path(child, pure_child)) + if self.should_crawl(child): + tasks.append(self._crawl_path(child, pure_child)) await asyncio.gather(*tasks) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 84332df..fb47c60 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -292,4 +292,4 @@ class Transformer: else: continue - return None + return path From f897d7c2e15f99780fc81945d107b88c1dc668e7 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 15:06:45 +0200 Subject: [PATCH 116/524] Add name variants for all arrows --- CONFIG.md | 57 ++++++++++++++++++++++++++++++++++++++++++-- PFERD/transformer.py | 44 +++++++++++++++++++++++++++------- 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index dd38c11..cccc751 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -143,7 +143,8 @@ Each line has the format `SOURCE ARROW TARGET` where `TARGET` is optional. literal delimited by `"` or `'` (e. g. `"foo\" bar/baz"`). Python's string escape syntax is supported. Trailing slashes are ignored. `TARGET` can be formatted like `SOURCE`, but it can also be a single exclamation mark without -quotes (`!`). `ARROW` is one of `-->`, `-exact->` and `-re->`. +quotes (`!`). `ARROW` is one of `-->`, `-exact->`, `-name->`, `-re->` and +`-name-re->` If a rule's target is `!`, this means that when the rule matches on a path, the corresponding file or directory is ignored. If a rule's target is missing, the @@ -157,6 +158,14 @@ that part of the path is replaced with `TARGET`. This means that the rule into `baz/xyz`. The rule `foo --> !` would ignore a directory named `foo` as well as all its contents. +### The `-name->` arrow + +The `-name->` arrow works similar to the `-->` arrow, but pretends it is in the +same directory as the file or directory it is applied to. For example, the rule +`bar -name-> baz` would convert `foo/bar` into `foo/baz` and `foo/bar/xyz` into +`foo/baz/xyz`. 
The rule `foo --> !` would ignore all files and directories named +`foo` as well as their contents. + ### The `-exact->` arrow The `-exact->` arrow requires the path to match `SOURCE` exactly. This means @@ -165,6 +174,14 @@ but `foo/bar/xyz` would be unaffected. Also, `foo -exact-> !` would only ignore `foo`, but not its contents (if it has any). The examples below show why this is useful. +### The `-name-exact->` arrow + +The `-name-exact->` arrow works similar to the `-exact->` arrow, but pretends it +is in the same directory as the file or directory it is applied to. For example, +the rule `bar -name-exact-> baz` would convert `foo/bar` into `foo/baz` but +`foo/bar/xyz` would be unaffected. The rule `foo --> !` would ignore only ignore +files and directories named `foo`, but not their contents. + ### The `-re->` arrow The `-re->` arrow uses regular expressions. `SOURCE` is a regular expression @@ -186,10 +203,15 @@ example `{g2.lower()}` or `{g3.replace(' ', '_')}`. [3]: "Format String Syntax" +### The `-name-re->` arrow + +The `-name-re>` arrow works similar to the `-re->` arrow, but pretends it is in +the same directory as the file or directory it is applied to. + ### Example: Tutorials You have an ILIAS course with lots of tutorials, but are only interested in a -single one? +single one. ``` tutorials/ @@ -236,3 +258,34 @@ To do this, you can use the most powerful of arrows: The regex arrow. ``` Note the escaped backslashes on the `SOURCE` side. + +### Example: Crawl a python project + +You are crawling a python project and want to ignore all hidden files (files +whose name starts with a `.`), all `__pycache__` directories and all markdown +files (for some weird reason). + +``` +.gitignore +.mypy_cache/ +.venv/ +CONFIG.md +PFERD/ + |- __init__.py + |- __main__.py + |- __pycache__/ + |- authenticator.py + |- config.py + ... +README.md +... +``` + +For this task, the name arrows can be used. 
They are variants of the normal +arrows that only look at the file name instead of the entire path. + +``` +\..* -name-re-> ! +__pycache__ -name-> ! +.*\.md -name-re-> ! +``` diff --git a/PFERD/transformer.py b/PFERD/transformer.py index fb47c60..1b80433 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -77,6 +77,19 @@ class ExactRule(Rule): return False +class NameRule(Rule): + def __init__(self, subrule: Rule): + self._subrule = subrule + + def transform(self, path: PurePath) -> Union[PurePath, bool]: + name = PurePath(*path.parts[-1:]) + result = self._subrule.transform(name) + if isinstance(result, PurePath): + return path.parent / result + else: + return result + + class ReRule(Rule): def __init__(self, left: str, right: Union[str, bool]): self._left = left @@ -220,16 +233,25 @@ def parse_arrow(line: Line) -> str: name = [] while True: - if c := line.get(): - if c == "-": - break - else: - name.append(c) - line.advance() - else: + c = line.get() + if not c: raise RuleParseException(line, "Expected rest of arrow") + elif c == "-": + line.advance() + c = line.get() + if not c: + raise RuleParseException(line, "Expected rest of arrow") + elif c == ">": + line.advance() + break # End of arrow + else: + name.append("-") + name.append(c) + else: + name.append(c) + + line.advance() - line.expect("->") return "".join(name) @@ -261,10 +283,16 @@ def parse_rule(line: Line) -> Rule: # Dispatch if arrowname == "": return NormalRule(PurePath(left), rightpath) + elif arrowname == "name": + return NameRule(NormalRule(PurePath(left), rightpath)) elif arrowname == "exact": return ExactRule(PurePath(left), rightpath) + elif arrowname == "name-exact": + return NameRule(ExactRule(PurePath(left), rightpath)) elif arrowname == "re": return ReRule(left, right) + elif arrowname == "name-re": + return NameRule(ReRule(left, right)) else: line.index = arrowindex + 1 # For nicer error message raise RuleParseException(line, "Invalid arrow name") From 
a6fdf05ee91902806dcfa51ad9cec6b6e843947b Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 15:13:34 +0200 Subject: [PATCH 117/524] Allow variable whitespace in arrow rules --- CONFIG.md | 6 +++--- PFERD/transformer.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index cccc751..df3e8f2 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -285,7 +285,7 @@ For this task, the name arrows can be used. They are variants of the normal arrows that only look at the file name instead of the entire path. ``` -\..* -name-re-> ! -__pycache__ -name-> ! -.*\.md -name-re-> ! +\..* -name-re-> ! +__pycache__ -name-> ! +.*\.md -name-re-> ! ``` diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 1b80433..135baf2 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -255,6 +255,12 @@ def parse_arrow(line: Line) -> str: return "".join(name) +def parse_whitespace(line: Line) -> None: + line.expect(" ") + while line.get() == " ": + line.advance() + + def parse_rule(line: Line) -> Rule: # Parse left side leftindex = line.index @@ -264,13 +270,13 @@ def parse_rule(line: Line) -> Rule: raise RuleParseException(line, "Left side can't be '!'") # Parse arrow - line.expect(" ") + parse_whitespace(line) arrowindex = line.index arrowname = parse_arrow(line) # Parse right side if line.get(): - line.expect(" ") + parse_whitespace(line) right = parse_string(line) else: right = False From 595de88d964332782133c51846d468b0412b45e4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 15:18:16 +0200 Subject: [PATCH 118/524] Fix authenticator and crawler names Now, the "auth:" and "crawl:" parts are considered part of the name. This fixes crawlers not being able to find their authenticators. 
--- PFERD/config.py | 14 ++++++-------- PFERD/crawler.py | 2 +- PFERD/pferd.py | 6 +++--- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/PFERD/config.py b/PFERD/config.py index 0520f74..66b882e 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -139,19 +139,17 @@ class Config: def crawler_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] - for section_name, section_proxy in self._parser.items(): - if section_name.startswith("crawler:"): - crawler_name = section_name[8:] - result.append((crawler_name, section_proxy)) + for name, proxy in self._parser.items(): + if name.startswith("crawler:"): + result.append((name, proxy)) return result def authenticator_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] - for section_name, section_proxy in self._parser.items(): - if section_name.startswith("auth:"): - crawler_name = section_name[5:] - result.append((crawler_name, section_proxy)) + for name, proxy in self._parser.items(): + if name.startswith("auth:"): + result.append((name, proxy)) return result diff --git a/PFERD/crawler.py b/PFERD/crawler.py index f8cf091..f49eba8 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -180,7 +180,7 @@ class CrawlerSection(Section): value = self.s.get("auth") if value is None: self.missing_value("auth") - auth = authenticators.get(f"auth:{value}") + auth = authenticators.get(value) if auth is None: self.invalid_value("auth", value, "No such auth section exists") return auth diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 4500ba9..9154a80 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -25,7 +25,7 @@ class Pferd: def _load_authenticators(self) -> None: abort = False for name, section in self._config.authenticator_sections(): - print(f"[bold bright_cyan]Loading[/] auth:{escape(name)}") + print(f"[bold bright_cyan]Loading[/] {escape(name)}") authenticator_type = section.get("type") authenticator_constructor = AUTHENTICATORS.get(authenticator_type) if authenticator_constructor is 
None: @@ -48,7 +48,7 @@ class Pferd: def _load_crawlers(self) -> None: abort = False for name, section in self._config.crawler_sections(): - print(f"[bold bright_cyan]Loading[/] crawler:{escape(name)}") + print(f"[bold bright_cyan]Loading[/] {escape(name)}") crawler_type = section.get("type") crawler_constructor = CRAWLERS.get(crawler_type) if crawler_constructor is None: @@ -79,6 +79,6 @@ class Pferd: for name, crawler in self._crawlers.items(): print() - print(f"[bold bright_cyan]Running[/] crawler:{escape(name)}") + print(f"[bold bright_cyan]Running[/] {escape(name)}") await crawler.run() From b2a2b5999bd38abfebfcc8ee3d48dcd90ccb59b6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 15 May 2021 15:18:51 +0200 Subject: [PATCH 119/524] Implement ILIAS auth and crawl home page This commit introduces the necessary machinery to authenticate with ILIAS and crawl the home page. It can't do much yet and just silently fetches the homepage. --- PFERD/crawlers/__init__.py | 3 + PFERD/crawlers/ilias.py | 209 +++++++++++++++++++++++++++++++++++++ PFERD/utils.py | 8 ++ setup.cfg | 1 + 4 files changed, 221 insertions(+) create mode 100644 PFERD/crawlers/ilias.py diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index b2e5af5..0ae2ca3 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -5,6 +5,7 @@ from ..authenticator import Authenticator from ..conductor import TerminalConductor from ..config import Config from ..crawler import Crawler +from .ilias import IliasCrawler, IliasCrawlerSection from .local import LocalCrawler, LocalCrawlerSection CrawlerConstructor = Callable[[ @@ -18,4 +19,6 @@ CrawlerConstructor = Callable[[ CRAWLERS: Dict[str, CrawlerConstructor] = { "local": lambda n, s, c, t, a: LocalCrawler(n, LocalCrawlerSection(s), c, t), + "ilias": lambda n, s, c, t, a: + IliasCrawler(n, IliasCrawlerSection(s), c, t, a), } diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py new file mode 100644 index 
0000000..84a7c15 --- /dev/null +++ b/PFERD/crawlers/ilias.py @@ -0,0 +1,209 @@ +from configparser import SectionProxy +from pathlib import PurePath +from typing import Any, Dict, Optional + +import aiohttp +from bs4 import BeautifulSoup +from PFERD.utils import soupify + +from ..authenticators import Authenticator +from ..conductor import TerminalConductor +from ..config import Config +from ..crawler import (Crawler, CrawlerSection, HttpCrawler, anoncritical, + arepeat) + + +class IliasCrawlerSection(CrawlerSection): + + def __init__(self, section: SectionProxy): + super().__init__(section) + + if not self.course_id() and not self.element_url(): + self.missing_value("course_id or element_url") + + def course_id(self) -> Optional[str]: + return self.s.get("course_id") + + def element_url(self) -> Optional[str]: + return self.s.get("element_url") + + def base_url(self) -> str: + return self.s.get("ilias_url", "https://ilias.studium.kit.edu/") + + def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]: + value = self.s.get("tfa_auth") + if not value: + return None + + auth = authenticators.get(f"auth:{value}") + if auth is None: + self.invalid_value("auth", value, "No such auth section exists") + return auth + + +class IliasCrawler(HttpCrawler): + def __init__( + self, + name: str, + section: IliasCrawlerSection, + config: Config, + conductor: TerminalConductor, + authenticators: Dict[str, Authenticator] + ): + super().__init__(name, section, config, conductor) + + self._shibboleth_login = KitShibbolethLogin( + section.auth(authenticators), + section.tfa_auth(authenticators) + ) + self._base_url = section.base_url() + + self._course_id = section.course_id() + self._element_url = section.element_url() + + async def crawl(self) -> None: + async with self.crawl_bar(PurePath("/")) as bar: + soup = await self._get_page(self._base_url) + self.print("[green]Gotcha![/]") + + async def _get_page(self, url: str, retries_left: int = 3) -> 
BeautifulSoup: + if retries_left < 0: + # TODO: Proper exception + raise RuntimeError("Get page failed too often") + async with self.session.get(url) as request: + soup = soupify(await request.read()) + if self._is_logged_in(soup): + return soup + + await self._shibboleth_login.login(self.session) + + return await self._get_page(url, retries_left - 1) + + @staticmethod + def _is_logged_in(soup: BeautifulSoup) -> bool: + # Normal ILIAS pages + userlog = soup.find("li", {"id": "userlog"}) + if userlog is not None: + return True + # Video listing embeds do not have complete ILIAS html. Try to match them by + # their video listing table + video_table = soup.find( + recursive=True, + name="table", + attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} + ) + if video_table is not None: + return True + # The individual video player wrapper page has nothing of the above. + # Match it by its playerContainer. + if soup.select_one("#playerContainer") is not None: + return True + return False + + +class KitShibbolethLogin: + """ + Login via KIT's shibboleth system. + """ + + def __init__(self, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator]) -> None: + self._auth = authenticator + self._tfa_auth = tfa_authenticator + + async def login(self, sess: aiohttp.ClientSession) -> None: + """ + Performs the ILIAS Shibboleth authentication dance and saves the login + cookies it receieves. + + This function should only be called whenever it is detected that you're + not logged in. The cookies obtained should be good for a few minutes, + maybe even an hour or two. 
+ """ + + # Equivalent: Click on "Mit KIT-Account anmelden" button in + # https://ilias.studium.kit.edu/login.php + url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login" + data = { + "sendLogin": "1", + "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", + "target": "/shib_login.php", + "home_organization_selection": "Mit KIT-Account anmelden", + } + soup: BeautifulSoup = await _post(sess, url, data) + + # Attempt to login using credentials, if necessary + while not self._login_successful(soup): + # Searching the form here so that this fails before asking for + # credentials rather than after asking. + form = soup.find("form", {"class": "full content", "method": "post"}) + action = form["action"] + + csrf_token = form.find("input", {"name": "csrf_token"})["value"] + + # Equivalent: Enter credentials in + # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO + url = "https://idp.scc.kit.edu" + action + username, password = await self._auth.credentials() + data = { + "_eventId_proceed": "", + "j_username": username, + "j_password": password, + "csrf_token": csrf_token + } + soup = await _post(sess, url, data) + + if self._tfa_required(soup): + soup = await self._authenticate_tfa(sess, soup) + + if not self._login_successful(soup): + self._auth.invalid_credentials() + + # Equivalent: Being redirected via JS automatically + # (or clicking "Continue" if you have JS disabled) + relay_state = soup.find("input", {"name": "RelayState"}) + saml_response = soup.find("input", {"name": "SAMLResponse"}) + url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" + data = { # using the info obtained in the while loop above + "RelayState": relay_state["value"], + "SAMLResponse": saml_response["value"], + } + await sess.post(url, data=data) + + async def _authenticate_tfa( + self, + session: aiohttp.ClientSession, + soup: BeautifulSoup + ) -> BeautifulSoup: + if not self._tfa_auth: + raise RuntimeError("No 'tfa_auth' present but you use two-factor 
authentication!") + + _, tfa_token = await self._tfa_auth.credentials() + + # Searching the form here so that this fails before asking for + # credentials rather than after asking. + form = soup.find("form", {"method": "post"}) + action = form["action"] + + # Equivalent: Enter token in + # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO + url = "https://idp.scc.kit.edu" + action + data = { + "_eventId_proceed": "", + "j_tokenNumber": tfa_token + } + return _post(session, url, data) + + @staticmethod + def _login_successful(soup: BeautifulSoup) -> bool: + relay_state = soup.find("input", {"name": "RelayState"}) + saml_response = soup.find("input", {"name": "SAMLResponse"}) + return relay_state is not None and saml_response is not None + + @staticmethod + def _tfa_required(soup: BeautifulSoup) -> bool: + return soup.find(id="j_tokenNumber") is not None + + +async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: + async with session.post(url, data=data) as response: + return soupify(await response.read()) diff --git a/PFERD/utils.py b/PFERD/utils.py index 08017aa..d7c61ec 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -4,6 +4,8 @@ import functools import getpass from typing import Any, Callable, Optional, TypeVar +import bs4 + T = TypeVar("T") @@ -23,6 +25,12 @@ async def ainput(prompt: str) -> str: async def agetpass(prompt: str) -> str: return await to_thread(lambda: getpass.getpass(prompt)) +def soupify(data: bytes) -> bs4.BeautifulSoup: + """ + Parses HTML to a beautifulsoup object. 
+ """ + + return bs4.BeautifulSoup(data, "html.parser") async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: """ diff --git a/setup.cfg b/setup.cfg index f2806e2..18ff558 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,6 +9,7 @@ install_requires = aiohttp>=3.7.4.post0 beautifulsoup4>=4.9.3 rich>=10.1.0 + beautifulsoup4>=4.9.3 [options.entry_points] console_scripts = From 868f4869225a4f4b5cd75a7483c8f8599f3a46f4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 17:12:25 +0200 Subject: [PATCH 120/524] Rename local crawler path to target --- CONFIG.md | 2 +- PFERD/crawlers/local.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index df3e8f2..22078ae 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -113,7 +113,7 @@ This crawler crawls a local directory. It is really simple and mostly useful for testing different setups. The various delay options are meant to make the crawler simulate a slower, network-based crawler. -- `path`: Path to the local directory to crawl. (Required) +- `target`: Path to the local directory to crawl. (Required) - `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl requests. 
(Default: 0.0) - `download_delay`: Maximum artificial delay (in seconds) to simulate for diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 360a9a9..2dde0d4 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -10,10 +10,10 @@ from ..crawler import Crawler, CrawlerSection, anoncritical class LocalCrawlerSection(CrawlerSection): - def path(self) -> Path: - value = self.s.get("path") + def target(self) -> Path: + value = self.s.get("target") if value is None: - self.missing_value("path") + self.missing_value("target") return Path(value).expanduser() def crawl_delay(self) -> float: @@ -48,7 +48,7 @@ class LocalCrawler(Crawler): ): super().__init__(name, section, config, conductor) - self._path = config.working_dir / section.path() + self._target = config.working_dir / section.target() self._crawl_delay = section.crawl_delay() self._download_delay = section.download_delay() self._download_speed = section.download_speed() @@ -59,7 +59,7 @@ class LocalCrawler(Crawler): self._block_size = 1024**2 # 1 MiB async def crawl(self) -> None: - await self._crawl_path(self._path, PurePath()) + await self._crawl_path(self._target, PurePath()) if self.error_free: await self.cleanup() From b70b62cef542b282c69071b5cf963ed91ead2b65 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 17:23:33 +0200 Subject: [PATCH 121/524] Make crawler sections start with "crawl:" Also, use only the part of the section name after the "crawl:" as the crawler's output directory. Now, the implementation matches the documentation again --- CONFIG.md | 2 +- PFERD/config.py | 2 +- PFERD/crawler.py | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 22078ae..11c4282 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -40,7 +40,7 @@ crawlers: - `type`: The types are specified in [this section](#crawler-types). - `output_dir`: The directory the crawler synchronizes files to. 
A crawler will - never place any files outside of this directory. (Default: crawler's name) + never place any files outside of this directory. (Default: the crawler's name) - `redownload`: When to download again a file that is already present locally. (Default: `never-smart`) - `never`: If a file is present locally, it is not downloaded again. diff --git a/PFERD/config.py b/PFERD/config.py index 66b882e..7a7e832 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -140,7 +140,7 @@ class Config: def crawler_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] for name, proxy in self._parser.items(): - if name.startswith("crawler:"): + if name.startswith("crawl:"): result.append((name, proxy)) return result diff --git a/PFERD/crawler.py b/PFERD/crawler.py index f49eba8..4148614 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -110,6 +110,9 @@ def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: class CrawlerSection(Section): def output_dir(self, name: str) -> Path: + # TODO Use removeprefix() after switching to 3.9 + if name.startswith("crawl:"): + name = name[len("crawl:"):] return Path(self.s.get("output_dir", name)).expanduser() def redownload(self) -> Redownload: From d63494908dcbea7146ab4b62157878d15c15aedb Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 17:37:05 +0200 Subject: [PATCH 122/524] Properly invalidate exceptions The simple authenticator now properly invalidates its credentials. Also, the invalidation functions have been given better names and documentation. 
--- PFERD/authenticator.py | 30 +++++++++++++++++++++++++++--- PFERD/authenticators/simple.py | 25 ++++++++++++++++++++++--- PFERD/crawlers/ilias.py | 2 +- 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/PFERD/authenticator.py b/PFERD/authenticator.py index 42d8bb9..b2f6164 100644 --- a/PFERD/authenticator.py +++ b/PFERD/authenticator.py @@ -42,11 +42,35 @@ class Authenticator(ABC): async def credentials(self) -> Tuple[str, str]: pass - def invalid_credentials(self) -> None: + def invalidate_credentials(self) -> None: + """ + Tell the authenticator that some or all of its credentials are invalid. + + Authenticators should overwrite this function if they have a way to + deal with this issue that is likely to result in valid credentials + (e. g. prompting the user). + """ + raise AuthException("Invalid credentials") - def invalid_username(self) -> None: + def invalidate_username(self) -> None: + """ + Tell the authenticator that specifically its username is invalid. + + Authenticators should overwrite this function if they have a way to + deal with this issue that is likely to result in valid credentials + (e. g. prompting the user). + """ + raise AuthException("Invalid username") - def invalid_password(self) -> None: + def invalidate_password(self) -> None: + """ + Tell the authenticator that specifically its password is invalid. + + Authenticators should overwrite this function if they have a way to + deal with this issue that is likely to result in valid credentials + (e. g. prompting the user). 
+ """ + raise AuthException("Invalid password") diff --git a/PFERD/authenticators/simple.py b/PFERD/authenticators/simple.py index 3a57faf..6ce6265 100644 --- a/PFERD/authenticators/simple.py +++ b/PFERD/authenticators/simple.py @@ -1,6 +1,6 @@ from typing import Optional, Tuple -from ..authenticator import Authenticator, AuthSection +from ..authenticator import Authenticator, AuthException, AuthSection from ..conductor import TerminalConductor from ..config import Config from ..utils import agetpass, ainput @@ -42,7 +42,26 @@ class SimpleAuthenticator(Authenticator): if self.password is None: self.password = await agetpass("Password: ") - else: - print("Password: *******") return self.username, self.password + + def invalidate_credentials(self) -> None: + if self.username_fixed and self.password_fixed: + raise AuthException("Configured credentials are invalid") + + if not self.username_fixed: + self.username = None + if not self.password_fixed: + self.password = None + + def invalidate_username(self) -> None: + if self.username_fixed: + raise AuthException("Configured username is invalid") + else: + self.username = None + + def invalidate_password(self) -> None: + if self.password_fixed: + raise AuthException("Configured password is invalid") + else: + self.password = None diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 84a7c15..ed3fd9c 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -156,7 +156,7 @@ class KitShibbolethLogin: soup = await self._authenticate_tfa(sess, soup) if not self._login_successful(soup): - self._auth.invalid_credentials() + self._auth.invalidate_credentials() # Equivalent: Being redirected via JS automatically # (or clicking "Continue" if you have JS disabled) From 8c32da7f19ef613b136288a1a8f9a4ab06433c09 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 18:24:03 +0200 Subject: [PATCH 123/524] Let authenticators provide username and password separately --- CONFIG.md | 2 +- 
PFERD/authenticator.py | 8 +++++++ PFERD/authenticators/simple.py | 42 ++++++++++++++++++---------------- PFERD/crawlers/ilias.py | 2 +- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 11c4282..ca6d92b 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -93,7 +93,7 @@ auth = auth:example ## The `auth:*` sections Sections whose names start with `auth:` are used to configure authenticators. An -authenticator provides login credentials to one or more crawlers. +authenticator provides a username and a password to one or more crawlers. Authenticators work similar to crawlers: A section represents an authenticator instance, whose name is the rest of the section name. The type is specified by diff --git a/PFERD/authenticator.py b/PFERD/authenticator.py index b2f6164..7475e2a 100644 --- a/PFERD/authenticator.py +++ b/PFERD/authenticator.py @@ -42,6 +42,14 @@ class Authenticator(ABC): async def credentials(self) -> Tuple[str, str]: pass + async def username(self) -> str: + username, _ = await self.credentials() + return username + + async def password(self) -> str: + _, password = await self.credentials() + return password + def invalidate_credentials(self) -> None: """ Tell the authenticator that some or all of its credentials are invalid. 
diff --git a/PFERD/authenticators/simple.py b/PFERD/authenticators/simple.py index 6ce6265..f21661c 100644 --- a/PFERD/authenticators/simple.py +++ b/PFERD/authenticators/simple.py @@ -24,44 +24,46 @@ class SimpleAuthenticator(Authenticator): ) -> None: super().__init__(name, section, config, conductor) - self.username = section.username() - self.password = section.password() + self._username = section.username() + self._password = section.password() - self.username_fixed = self.username is not None - self.password_fixed = self.password is not None + self._username_fixed = self.username is not None + self._password_fixed = self.password is not None async def credentials(self) -> Tuple[str, str]: - if self.username is not None and self.password is not None: - return self.username, self.password + if self._username is not None and self._password is not None: + return self._username, self._password async with self.conductor.exclusive_output(): - if self.username is None: - self.username = await ainput("Username: ") + if self._username is None: + self._username = await ainput("Username: ") else: print(f"Username: {self.username}") - if self.password is None: - self.password = await agetpass("Password: ") + if self._password is None: + self._password = await agetpass("Password: ") - return self.username, self.password + # Intentionally returned inside the context manager so we know + # they're both not None + return self._username, self._password def invalidate_credentials(self) -> None: - if self.username_fixed and self.password_fixed: + if self._username_fixed and self._password_fixed: raise AuthException("Configured credentials are invalid") - if not self.username_fixed: - self.username = None - if not self.password_fixed: - self.password = None + if not self._username_fixed: + self._username = None + if not self._password_fixed: + self._password = None def invalidate_username(self) -> None: - if self.username_fixed: + if self._username_fixed: raise 
AuthException("Configured username is invalid") else: - self.username = None + self._username = None def invalidate_password(self) -> None: - if self.password_fixed: + if self._password_fixed: raise AuthException("Configured password is invalid") else: - self.password = None + self._password = None diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index ed3fd9c..2352945 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -177,7 +177,7 @@ class KitShibbolethLogin: if not self._tfa_auth: raise RuntimeError("No 'tfa_auth' present but you use two-factor authentication!") - _, tfa_token = await self._tfa_auth.credentials() + tfa_token = await self._tfa_auth.password() # Searching the form here so that this fails before asking for # credentials rather than after asking. From e1104f888d761568e950c70437a72e2168d6c9e2 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 18:27:16 +0200 Subject: [PATCH 124/524] Add tfa authenticator --- CONFIG.md | 6 ++++++ PFERD/authenticators/__init__.py | 5 ++++- PFERD/authenticators/tfa.py | 37 ++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 PFERD/authenticators/tfa.py diff --git a/CONFIG.md b/CONFIG.md index ca6d92b..53c0706 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -131,6 +131,12 @@ via the terminal. - `username`: The username. (Optional) - `password`: The password. (Optional) +### The `tfa` authenticator + +This authenticator prompts the user on the console for a two-factor +authentication token. The token is provided as password and it is not cached. +This authenticator does not support usernames. + ## Transformation rules Transformation rules are rules for renaming and excluding files and directories. 
diff --git a/PFERD/authenticators/__init__.py b/PFERD/authenticators/__init__.py index d021d40..97ff03a 100644 --- a/PFERD/authenticators/__init__.py +++ b/PFERD/authenticators/__init__.py @@ -1,10 +1,11 @@ from configparser import SectionProxy from typing import Callable, Dict -from ..authenticator import Authenticator +from ..authenticator import Authenticator, AuthSection from ..conductor import TerminalConductor from ..config import Config from .simple import SimpleAuthenticator, SimpleAuthSection +from .tfa import TfaAuthenticator AuthConstructor = Callable[[ str, # Name (without the "auth:" prefix) @@ -16,4 +17,6 @@ AuthConstructor = Callable[[ AUTHENTICATORS: Dict[str, AuthConstructor] = { "simple": lambda n, s, c, t: SimpleAuthenticator(n, SimpleAuthSection(s), c, t), + "tfa": lambda n, s, c, t: + TfaAuthenticator(n, AuthSection(s), c, t), } diff --git a/PFERD/authenticators/tfa.py b/PFERD/authenticators/tfa.py new file mode 100644 index 0000000..3513d09 --- /dev/null +++ b/PFERD/authenticators/tfa.py @@ -0,0 +1,37 @@ +from typing import Tuple + +from ..authenticator import Authenticator, AuthException, AuthSection +from ..conductor import TerminalConductor +from ..config import Config +from ..utils import ainput + + +class TfaAuthenticator(Authenticator): + def __init__( + self, + name: str, + section: AuthSection, + config: Config, + conductor: TerminalConductor, + ) -> None: + super().__init__(name, section, config, conductor) + + async def username(self) -> str: + raise AuthException("TFA authenticator does not support usernames") + + async def password(self) -> str: + async with self.conductor.exclusive_output(): + code = await ainput("TFA code: ") + return code + + async def credentials(self) -> Tuple[str, str]: + raise AuthException("TFA authenticator does not support usernames") + + def invalidate_username(self) -> None: + raise AuthException("TFA authenticator does not support usernames") + + def invalidate_password(self) -> None: + pass + + def 
invalidate_credentials(self) -> None: + pass From 1123c8884d54822bf2a285fd9c6c423fa0eb1a2e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 15 May 2021 18:57:17 +0200 Subject: [PATCH 125/524] Implement an IliasPage This allows PFERD to semantically understand ILIAS HTML and is the foundation for the ILIAS crawler. This patch extends the ILIAS crawler to crawl the personal desktop and print the elements on it. --- PFERD/crawlers/ilias.py | 338 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 335 insertions(+), 3 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 2352945..2d9a9c9 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -1,9 +1,15 @@ +import json +import re from configparser import SectionProxy +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum from pathlib import PurePath -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional +from urllib.parse import urljoin, urlparse import aiohttp -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from PFERD.utils import soupify from ..authenticators import Authenticator @@ -41,6 +47,330 @@ class IliasCrawlerSection(CrawlerSection): return auth +class IliasElementType(Enum): + EXERCISE = "exercise" + FILE = "file" + FOLDER = "folder" + FORUM = "forum" + LINK = "link" + MEETING = "meeting" + VIDEO = "video" + VIDEO_PLAYER = "video_player" + VIDEO_FOLDER = "video_folder" + VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" + + +@dataclass +class IliasPageElement: + type: IliasElementType + url: str + name: str + mtime: Optional[datetime] = None + query_parameter: Dict[str, str] = field(default_factory=dict) + + +class IliasPage: + + def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): + self._soup = soup + self._page_url = _page_url + self._page_type = source_element.type if source_element else None + 
self._source_name = source_element.name if source_element else "" + + def get_child_elements(self) -> List[IliasPageElement]: + """ + Return all child page elements you can find here. + """ + if self._is_video_player(): + return self._player_to_video() + if self._is_video_listing(): + return self._find_video_entries() + return self._find_normal_entries() + + def _is_video_player(self) -> bool: + return "paella_config_file" in str(self._soup) + + def _is_video_listing(self) -> bool: + if self._soup.find(id="headerimage"): + element: Tag = self._soup.find(id="headerimage") + if "opencast" in element.attrs["src"].lower(): + return True + return False + + def _player_to_video(self) -> List[IliasPageElement]: + # Fetch the actual video page. This is a small wrapper page initializing a javscript + # player. Sadly we can not execute that JS. The actual video stream url is nowhere + # on the page, but defined in a JS object inside a script tag, passed to the player + # library. + # We do the impossible and RegEx the stream JSON object out of the page's HTML source + regex: re.Pattern[str] = re.compile( + r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE + ) + json_match = regex.search(str(self._soup)) + + if json_match is None: + print(f"Could not find json stream info for {self._page_url!r}") + return [] + json_str = json_match.group(1) + + # parse it + json_object = json.loads(json_str) + # and fetch the video url! + video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] + return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + + def _find_video_entries(self) -> List[IliasPageElement]: + # ILIAS has three stages for video pages + # 1. The initial dummy page without any videos. This page contains the link to the listing + # 2. The video listing which might be paginated + # 3. An unpaginated video listing (or at least one that includes 800 videos) + # + # We need to figure out where we are. 
+ + video_element_table: Tag = self._soup.find( + name="table", id=re.compile(r"tbl_xoct_.+") + ) + + if video_element_table is None: + # We are in stage 1 + # The page is actually emtpy but contains the link to stage 2 + content_link: Tag = self._soup.select_one("#tab_series a") + url: str = self._abs_url_from_link(content_link) + query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} + return [IliasPageElement( + IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "", query_parameter=query_params + )] + + is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None + + if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: + # We are in stage 2 - try to break pagination + return self._find_video_entries_paginated() + + return self._find_video_entries_no_paging() + + def _find_video_entries_paginated(self) -> List[IliasPageElement]: + table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) + + if table_element is None: + # TODO: Properly log this + print( + "Could not increase elements per page (table not found)." + " Some might not be crawled!" + ) + return self._find_video_entries_no_paging() + + id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) + if id_match is None: + # TODO: Properly log this + print( + "Could not increase elements per page (table id not found)." + " Some might not be crawled!" + ) + return self._find_video_entries_no_paging() + + table_id = id_match.group(1) + + query_params = {f"tbl_xoct_{table_id}_trows": "800", + "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} + return [IliasPageElement( + IliasElementType.VIDEO_FOLDER, self._page_url, "", query_parameter=query_params + )] + + def _find_video_entries_no_paging(self) -> List[IliasPageElement]: + """ + Crawls the "second stage" video page. This page contains the actual video urls. 
+ """ + # Video start links are marked with an "Abspielen" link + video_links: List[Tag] = self._soup.findAll( + name="a", text=re.compile(r"\s*Abspielen\s*") + ) + + results: List[IliasPageElement] = [] + + # TODO: Sadly the download button is currently broken, so never do that + for link in video_links: + results.append(self._listed_video_to_element(link)) + + return results + + def _listed_video_to_element(self, link: Tag) -> IliasPageElement: + # The link is part of a table with multiple columns, describing metadata. + # 6th child (1 indexed) is the modification time string + modification_string = link.parent.parent.parent.select_one( + "td.std:nth-child(6)" + ).getText().strip() + modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") + + title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() + title += ".mp4" + + video_name: str = _sanitize_path_name(title) + + video_url = self._abs_url_from_link(link) + + return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) + + def _find_normal_entries(self) -> List[IliasPageElement]: + result: List[IliasPageElement] = [] + + # Fetch all links and throw them to the general interpreter + links: List[Tag] = self._soup.select("a.il_ContainerItemTitle") + + for link in links: + abs_url = self._abs_url_from_link(link) + element_name = _sanitize_path_name(link.getText()) + element_type = self._find_type_from_link(element_name, link, abs_url) + + if not element_type: + continue + elif element_type == IliasElementType.MEETING: + element_path = _sanitize_path_name(self._normalize_meeting_name(element_name)) + elif element_type == IliasElementType.FILE: + result.append(self._file_to_element(element_name, abs_url, link)) + continue + + result.append(IliasPageElement(element_type, abs_url, element_name, None)) + + return result + + def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement: + # Files have a list 
of properties (type, modification date, size, etc.) + # In a series of divs. + # Find the parent containing all those divs, so we can filter our what we need + properties_parent: Tag = link_element.findParent( + "div", {"class": lambda x: "il_ContainerListItem" in x} + ).select_one(".il_ItemProperties") + # The first one is always the filetype + file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() + + # The rest does not have a stable order. Grab the whole text and reg-ex the date + # out of it + all_properties_text = properties_parent.getText().strip() + modification_date_match = re.search( + r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", + all_properties_text + ) + if modification_date_match is None: + modification_date = None + # TODO: Properly log this + print(f"Could not extract start date from {all_properties_text!r}") + else: + modification_date_str = modification_date_match.group(1) + modification_date = demangle_date(modification_date_str) + + # Grab the name from the link text + name = _sanitize_path_name(link_element.getText()) + full_path = name + "." + file_type + + return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) + + @staticmethod + def _find_type_from_link( + element_name: str, + link_element: Tag, + url: str + ) -> Optional[IliasElementType]: + """ + Decides which sub crawler to use for a given top level element. 
+ """ + parsed_url = urlparse(url) + + # file URLs contain "target=file" + if "target=file_" in parsed_url.query: + return IliasElementType.FILE + + # Skip forums + if "cmd=showThreads" in parsed_url.query: + return IliasElementType.FORUM + + # Everything with a ref_id can *probably* be opened to reveal nested things + # video groups, directories, exercises, etc + if "ref_id=" in parsed_url.query: + return IliasPage._find_type_from_folder_like(link_element, url) + + # TODO: Log this properly + print(f"Unknown type: The element was at {str(element_name)!r} and it is {link_element!r})") + return None + + @staticmethod + def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]: + """ + Try crawling something that looks like a folder. + """ + # pylint: disable=too-many-return-statements + + found_parent: Optional[Tag] = None + + # We look for the outer div of our inner link, to find information around it + # (mostly the icon) + for parent in link_element.parents: + if "ilContainerListItemOuter" in parent["class"]: + found_parent = parent + break + + if found_parent is None: + # TODO: Log this properly + print(f"Could not find element icon for {url!r}") + return None + + # Find the small descriptive icon to figure out the type + img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") + + if img_tag is None: + # TODO: Log this properly + print(f"Could not find image tag for {url!r}") + return None + + if "opencast" in str(img_tag["alt"]).lower(): + return IliasElementType.VIDEO_FOLDER + + if str(img_tag["src"]).endswith("icon_exc.svg"): + return IliasElementType.EXERCISE + + if str(img_tag["src"]).endswith("icon_webr.svg"): + return IliasElementType.LINK + + if str(img_tag["src"]).endswith("frm.svg"): + return IliasElementType.FORUM + + if str(img_tag["src"]).endswith("sess.svg"): + return IliasElementType.MEETING + + return IliasElementType.FOLDER + + @staticmethod + def _normalize_meeting_name(meeting_name: str) -> str: 
+ """ + Normalizes meeting names, which have a relative time as their first part, + to their date in ISO format. + """ + date_portion_str = meeting_name.split(" - ")[0] + date_portion = demangle_date(date_portion_str) + + if not date_portion: + return meeting_name + + rest_of_name = meeting_name + if rest_of_name.startswith(date_portion_str): + rest_of_name = rest_of_name[len(date_portion_str):] + + return datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name + + def _abs_url_from_link(self, link_tag: Tag) -> str: + """ + Create an absolute url from an tag. + """ + return urljoin(self._page_url, link_tag.get("href")) + +def demangle_date(date_str: str) -> Optional[datetime]: + return None + + +def _sanitize_path_name(name: str) -> str: + return name.replace("/", "-").replace("\\", "-").strip() + + class IliasCrawler(HttpCrawler): def __init__( self, @@ -64,7 +394,9 @@ class IliasCrawler(HttpCrawler): async def crawl(self) -> None: async with self.crawl_bar(PurePath("/")) as bar: soup = await self._get_page(self._base_url) - self.print("[green]Gotcha![/]") + page = IliasPage(soup, self._base_url, None) + for element in page.get_child_elements(): + self.print(element.name + " " + str(element.type)) async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup: if retries_left < 0: From c7494e32ce6de3f5b9ab8e717a15bdfd43dbf766 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 15 May 2021 20:42:18 +0200 Subject: [PATCH 126/524] Start implementing crawling in ILIAS crawler The ilias crawler can now crawl quite a few filetypes, splits off folders and crawls them concurrently. 
--- PFERD/crawlers/ilias.py | 182 +++++++++++++++++++++++++++++++++------- 1 file changed, 152 insertions(+), 30 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 2d9a9c9..39c7184 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -1,3 +1,4 @@ +import asyncio import json import re from configparser import SectionProxy @@ -5,8 +6,9 @@ from dataclasses import dataclass, field from datetime import datetime from enum import Enum from pathlib import PurePath -from typing import Any, Dict, List, Optional -from urllib.parse import urljoin, urlparse +from typing import Any, Dict, List, Optional, Set, Union +from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, + urlunsplit) import aiohttp from bs4 import BeautifulSoup, Tag @@ -18,23 +20,27 @@ from ..config import Config from ..crawler import (Crawler, CrawlerSection, HttpCrawler, anoncritical, arepeat) +TargetType = Union[str, int] + class IliasCrawlerSection(CrawlerSection): - def __init__(self, section: SectionProxy): - super().__init__(section) + def target(self) -> TargetType: + target = self.s.get("target") + if not target: + self.missing_value("target") - if not self.course_id() and not self.element_url(): - self.missing_value("course_id or element_url") + if re.fullmatch(r"\d+", target): + # Course id + return int(target) + if target == "desktop": + # Full personal desktop + return target + if target.startswith("https://ilias.studium.kit.edu"): + # ILIAS URL + return target - def course_id(self) -> Optional[str]: - return self.s.get("course_id") - - def element_url(self) -> Optional[str]: - return self.s.get("element_url") - - def base_url(self) -> str: - return self.s.get("ilias_url", "https://ilias.studium.kit.edu/") + self.invalid_value("target", target, "Should be ") def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]: value = self.s.get("tfa_auth") @@ -66,7 +72,6 @@ class IliasPageElement: url: str name: 
str mtime: Optional[datetime] = None - query_parameter: Dict[str, str] = field(default_factory=dict) class IliasPage: @@ -91,11 +96,17 @@ class IliasPage: return "paella_config_file" in str(self._soup) def _is_video_listing(self) -> bool: + # ILIAS fluff around it if self._soup.find(id="headerimage"): element: Tag = self._soup.find(id="headerimage") if "opencast" in element.attrs["src"].lower(): return True - return False + + # Raw listing without ILIAS fluff + video_element_table: Tag = self._soup.find( + name="table", id=re.compile(r"tbl_xoct_.+") + ) + return video_element_table is not None def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript @@ -137,9 +148,8 @@ class IliasPage: content_link: Tag = self._soup.select_one("#tab_series a") url: str = self._abs_url_from_link(content_link) query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} - return [IliasPageElement( - IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "", query_parameter=query_params - )] + url = _url_set_query_params(url, query_params) + return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None @@ -173,9 +183,8 @@ class IliasPage: query_params = {f"tbl_xoct_{table_id}_trows": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} - return [IliasPageElement( - IliasElementType.VIDEO_FOLDER, self._page_url, "", query_parameter=query_params - )] + url = _url_set_query_params(self._page_url, query_params) + return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] def _find_video_entries_no_paging(self) -> List[IliasPageElement]: """ @@ -363,6 +372,7 @@ class IliasPage: """ return urljoin(self._page_url, link_tag.get("href")) + def demangle_date(date_str: str) -> Optional[datetime]: return None @@ -371,6 +381,36 @@ def _sanitize_path_name(name: str) -> str: return name.replace("/", 
"-").replace("\\", "-").strip() +def _url_set_query_param(url: str, param: str, value: str) -> str: + """ + Set a query parameter in an url, overwriting existing ones with the same name. + """ + scheme, netloc, path, query, fragment = urlsplit(url) + query_parameters = parse_qs(query) + query_parameters[param] = [value] + new_query_string = urlencode(query_parameters, doseq=True) + + return urlunsplit((scheme, netloc, path, new_query_string, fragment)) + + +def _url_set_query_params(url: str, params: Dict[str, str]) -> str: + result = url + + for key, val in params.items(): + result = _url_set_query_param(result, key, val) + + return result + + +_DIRECTORY_PAGES: Set[IliasElementType] = set([ + IliasElementType.EXERCISE, + IliasElementType.FOLDER, + IliasElementType.MEETING, + IliasElementType.VIDEO_FOLDER, + IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, +]) + + class IliasCrawler(HttpCrawler): def __init__( self, @@ -386,22 +426,104 @@ class IliasCrawler(HttpCrawler): section.auth(authenticators), section.tfa_auth(authenticators) ) - self._base_url = section.base_url() + self._base_url = "https://ilias.studium.kit.edu" - self._course_id = section.course_id() - self._element_url = section.element_url() + self._target = section.target() async def crawl(self) -> None: - async with self.crawl_bar(PurePath("/")) as bar: - soup = await self._get_page(self._base_url) - page = IliasPage(soup, self._base_url, None) - for element in page.get_child_elements(): - self.print(element.name + " " + str(element.type)) + if isinstance(self._target, int): + await self._crawl_course(self._target) + elif self._target == "desktop": + await self._crawl_desktop() + else: + await self._crawl_url(self._target) + + async def _crawl_course(self, course_id: int) -> None: + # Start crawling at the given course + root_url = _url_set_query_param( + self._base_url + "/goto.php", "target", f"crs_{course_id}" + ) + + await self._crawl_url(root_url, expected_id=course_id) + + async def 
_crawl_desktop(self) -> None: + await self._crawl_url(self._base_url) + + async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: + tasks = [] + + async with self.crawl_bar(PurePath("Root element")): + soup = await self._get_page(url) + + if expected_id is not None: + perma_link_element: Tag = soup.find(id="current_perma_link") + if not perma_link_element or "crs_" not in perma_link_element.get("value"): + # TODO: Properly handle error + raise RuntimeError( + "Invalid course id? I didn't find anything looking like a course!") + + # Duplicated code, but the root page is special - we want to void fetching it twice! + page = IliasPage(soup, url, None) + for child in page.get_child_elements(): + tasks.append(self._handle_ilias_element(PurePath("."), child)) + await asyncio.gather(*tasks) + + async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: + tasks = [] + async with self.crawl_bar(path): + soup = await self._get_page(url) + page = IliasPage(soup, url, parent) + + for child in page.get_child_elements(): + tasks.append(self._handle_ilias_element(path, child)) + + await asyncio.gather(*tasks) + + async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: + element_path = PurePath(parent_path, element.name) + + if element.type == IliasElementType.FILE: + await self._download_element(element, element_path) + elif element.type == IliasElementType.FORUM: + # TODO: Delete + self.print(f"Skipping forum [green]{element_path}[/]") + elif element.type == IliasElementType.LINK: + # TODO: Write in meta-redirect file + self.print(f"Skipping link [green]{element_path}[/]") + elif element.type == IliasElementType.VIDEO: + await self._download_element(element, element_path) + elif element.type == IliasElementType.VIDEO_PLAYER: + # FIXME: Check if we should look at this and if not bail out already! 
+ # This saves us a request for each video, if we skip them anyways + raise RuntimeError("IMPLEMENT ME") + elif element.type in _DIRECTORY_PAGES: + await self._handle_ilias_page(element.url, element, element_path) + else: + # TODO: Proper exception + raise RuntimeError(f"Unknown type: {element.type!r}") + + async def _download_element(self, element: IliasPageElement, element_path: PurePath) -> None: + dl = await self.download(element_path, mtime=element.mtime) + if not dl: + return + + async with self.download_bar(element_path) as bar, dl as sink,\ + self.session.get(element.url) as resp: + + if resp.content_length: + bar.set_total(resp.content_length) + + async for data in resp.content.iter_chunked(1024): + sink.file.write(data) + bar.advance(len(data)) + + sink.done() async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup: if retries_left < 0: # TODO: Proper exception raise RuntimeError("Get page failed too often") + print(url) async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): From 7d323ec62b661c4d3b90460af2f87d200f63047a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 15 May 2021 21:29:43 +0200 Subject: [PATCH 127/524] Implement video downloads in ilias crawler --- PFERD/crawlers/ilias.py | 55 +++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 39c7184..2f3920c 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -6,12 +6,14 @@ from dataclasses import dataclass, field from datetime import datetime from enum import Enum from pathlib import PurePath -from typing import Any, Dict, List, Optional, Set, Union +# TODO In Python 3.9 and above, AsyncContextManager is deprecated +from typing import Any, AsyncContextManager, Dict, List, Optional, Set, Union from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, urlunsplit) import aiohttp 
from bs4 import BeautifulSoup, Tag +from PFERD.output_dir import Redownload from PFERD.utils import soupify from ..authenticators import Authenticator @@ -19,6 +21,7 @@ from ..conductor import TerminalConductor from ..config import Config from ..crawler import (Crawler, CrawlerSection, HttpCrawler, anoncritical, arepeat) +from ..output_dir import FileSink TargetType = Union[str, int] @@ -438,6 +441,9 @@ class IliasCrawler(HttpCrawler): else: await self._crawl_url(self._target) + if self.error_free: + await self.cleanup() + async def _crawl_course(self, course_id: int) -> None: # Start crawling at the given course root_url = _url_set_query_param( @@ -483,7 +489,7 @@ class IliasCrawler(HttpCrawler): element_path = PurePath(parent_path, element.name) if element.type == IliasElementType.FILE: - await self._download_element(element, element_path) + await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: # TODO: Delete self.print(f"Skipping forum [green]{element_path}[/]") @@ -491,33 +497,50 @@ class IliasCrawler(HttpCrawler): # TODO: Write in meta-redirect file self.print(f"Skipping link [green]{element_path}[/]") elif element.type == IliasElementType.VIDEO: - await self._download_element(element, element_path) + await self._download_file(element, element_path) elif element.type == IliasElementType.VIDEO_PLAYER: - # FIXME: Check if we should look at this and if not bail out already! 
- # This saves us a request for each video, if we skip them anyways - raise RuntimeError("IMPLEMENT ME") + await self._download_video(element, element_path) elif element.type in _DIRECTORY_PAGES: await self._handle_ilias_page(element.url, element, element_path) else: # TODO: Proper exception raise RuntimeError(f"Unknown type: {element.type!r}") - async def _download_element(self, element: IliasPageElement, element_path: PurePath) -> None: + async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: + # Videos will NOT be redownloaded - their content doesn't really change and they are chunky + dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) + if not dl: + return + + async with self.download_bar(element_path) as bar: + page = IliasPage(await self._get_page(element.url), element.url, element) + real_element = page.get_child_elements()[0] + + async with dl as sink, self.session.get(element.url) as resp: + if resp.content_length: + bar.set_total(resp.content_length) + + async for data in resp.content.iter_chunked(1024): + sink.file.write(data) + bar.advance(len(data)) + + sink.done() + + async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) if not dl: return - async with self.download_bar(element_path) as bar, dl as sink,\ - self.session.get(element.url) as resp: + async with self.download_bar(element_path) as bar: + async with dl as sink, self.session.get(element.url) as resp: + if resp.content_length: + bar.set_total(resp.content_length) - if resp.content_length: - bar.set_total(resp.content_length) + async for data in resp.content.iter_chunked(1024): + sink.file.write(data) + bar.advance(len(data)) - async for data in resp.content.iter_chunked(1024): - sink.file.write(data) - bar.advance(len(data)) - - sink.done() + sink.done() async def _get_page(self, url: str, retries_left: int = 3) -> 
BeautifulSoup: if retries_left < 0: From c454fabc9db22f8389f3a35c951a2e15bfaee39e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 15 May 2021 21:40:17 +0200 Subject: [PATCH 128/524] Add support for exercises in ILIAS crawler --- PFERD/crawlers/ilias.py | 49 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 2f3920c..e52d329 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -93,6 +93,8 @@ class IliasPage: return self._player_to_video() if self._is_video_listing(): return self._find_video_entries() + if self._is_exercise_file(): + return self._find_exercise_entries() return self._find_normal_entries() def _is_video_player(self) -> bool: @@ -111,6 +113,19 @@ class IliasPage: ) return video_element_table is not None + def _is_exercise_file(self) -> bool: + # we know it from before + if self._page_type == IliasElementType.EXERCISE: + return True + + # We have no suitable parent - let's guesss + if self._soup.find(id="headerimage"): + element: Tag = self._soup.find(id="headerimage") + if "exc" in element.attrs["src"].lower(): + return True + + return False + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. 
The actual video stream url is nowhere @@ -223,6 +238,40 @@ class IliasPage: return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) + def _find_exercise_entries(self) -> List[IliasPageElement]: + results: List[IliasPageElement] = [] + + # Each assignment is in an accordion container + assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer") + + for container in assignment_containers: + # Fetch the container name out of the header to use it in the path + container_name = container.select_one(".ilAssignmentHeader").getText().strip() + # Find all download links in the container (this will contain all the files) + files: List[Tag] = container.findAll( + name="a", + # download links contain the given command class + attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, + text="Download" + ) + + # Grab each file as you now have the link + for file_link in files: + # Two divs, side by side. Left is the name, right is the link ==> get left + # sibling + file_name = file_link.parent.findPrevious(name="div").getText().strip() + file_name = _sanitize_path_name(file_name) + url = self._abs_url_from_link(file_link) + + results.append(IliasPageElement( + IliasElementType.FILE, + url, + container_name + "/" + file_name, + None # We do not have any timestamp + )) + + return results + def _find_normal_entries(self) -> List[IliasPageElement]: result: List[IliasPageElement] = [] From 05573ccc53cf4a9e446bf5e010c263670d3002f5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 21:33:51 +0200 Subject: [PATCH 129/524] Add fancy CLI options --- PFERD/__main__.py | 233 +++++++++++++++++++++++++++++++++++++++----- PFERD/config.py | 13 +-- PFERD/crawler.py | 44 +++------ PFERD/output_dir.py | 16 +++ 4 files changed, 250 insertions(+), 56 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index a16b19b..5815f40 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -1,40 +1,229 @@ 
import argparse import asyncio +import configparser from pathlib import Path from .config import Config, ConfigDumpException, ConfigLoadException +from .output_dir import OnConflict, Redownload from .pferd import Pferd +GENERAL_PARSER = argparse.ArgumentParser(add_help=False) +GENERAL_PARSER.add_argument( + "--config", "-c", + type=Path, + metavar="PATH", + help="custom config file" +) +GENERAL_PARSER.add_argument( + "--dump-config", + nargs="?", + const=True, + metavar="PATH", + help="dump current configuration to a file and exit." + " Uses default config file path if no path is specified" +) +GENERAL_PARSER.add_argument( + "--crawler", + action="append", + type=str, + metavar="NAME", + help="only execute a single crawler." + " Can be specified multiple times to execute multiple crawlers" +) +GENERAL_PARSER.add_argument( + "--working-dir", + type=Path, + metavar="PATH", + help="custom working directory" +) + + +def load_general( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + section = parser[parser.default_section] + + if args.working_dir is not None: + section["working_dir"] = str(args.working_dir) + + +CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) +CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( + title="general crawler arguments", + description="arguments common to all crawlers", +) +CRAWLER_PARSER_GROUP.add_argument( + "--redownload", + type=Redownload.from_string, + metavar="OPTION", + help="when to redownload a file that's already present locally" +) +CRAWLER_PARSER_GROUP.add_argument( + "--on-conflict", + type=OnConflict.from_string, + metavar="OPTION", + help="what to do when local and remote files or directories differ" +) +CRAWLER_PARSER_GROUP.add_argument( + "--transform", "-t", + action="append", + type=str, + metavar="RULE", + help="add a single transformation rule. 
Can be specified multiple times" +) +CRAWLER_PARSER_GROUP.add_argument( + "--max-concurrent-tasks", + type=int, + metavar="N", + help="maximum number of concurrent tasks (crawling, downloading)" +) +CRAWLER_PARSER_GROUP.add_argument( + "--max-concurrent-downloads", + type=int, + metavar="N", + help="maximum number of tasks that may download data at the same time" +) +CRAWLER_PARSER_GROUP.add_argument( + "--delay-between-tasks", + type=float, + metavar="SECONDS", + help="time the crawler should wait between subsequent tasks" +) + + +def load_crawler( + args: argparse.Namespace, + section: configparser.SectionProxy, +) -> None: + if args.redownload is not None: + section["redownload"] = args.redownload.value + if args.on_conflict is not None: + section["on_conflict"] = args.on_conflict.value + if args.transform is not None: + section["transform"] = "\n" + "\n".join(args.transform) + if args.max_concurrent_tasks is not None: + section["max_concurrent_tasks"] = str(args.max_concurrent_tasks) + if args.max_concurrent_downloads is not None: + section["max_concurrent_downloads"] = str(args.max_concurrent_downloads) + if args.delay_between_tasks is not None: + section["delay_between_tasks"] = str(args.delay_between_tasks) + + +PARSER = argparse.ArgumentParser(parents=[GENERAL_PARSER]) +PARSER.set_defaults(command=None) +SUBPARSERS = PARSER.add_subparsers(title="crawlers") + + +LOCAL_CRAWLER = SUBPARSERS.add_parser( + "local", + parents=[GENERAL_PARSER, CRAWLER_PARSER], +) +LOCAL_CRAWLER.set_defaults(command="local") +LOCAL_CRAWLER_GROUP = LOCAL_CRAWLER.add_argument_group( + title="local crawler arguments", + description="arguments for the 'local' crawler", +) +LOCAL_CRAWLER_GROUP.add_argument( + "target", + type=Path, + metavar="TARGET", + help="directory to crawl" +) +LOCAL_CRAWLER_GROUP.add_argument( + "output", + type=Path, + metavar="OUTPUT", + help="output directory" +) +LOCAL_CRAWLER_GROUP.add_argument( + "--crawl-delay", + type=float, + metavar="SECONDS", + 
help="artificial delay to simulate for crawl requests" +) +LOCAL_CRAWLER_GROUP.add_argument( + "--download-delay", + type=float, + metavar="SECONDS", + help="artificial delay to simulate for download requests" +) +LOCAL_CRAWLER_GROUP.add_argument( + "--download-speed", + type=int, + metavar="BYTES_PER_SECOND", + help="download speed to simulate" +) + + +def load_local_crawler( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + parser["crawl:local"] = {} + section = parser["crawl:local"] + load_crawler(args, section) + + section["type"] = "local" + section["target"] = str(args.target) + section["output_dir"] = str(args.output) + if args.crawl_delay is not None: + section["crawl_delay"] = str(args.crawl_delay) + if args.download_delay is not None: + section["download_delay"] = str(args.download_delay) + if args.download_speed is not None: + section["download_speed"] = str(args.download_speed) + + +def load_parser( + args: argparse.Namespace, +) -> configparser.ConfigParser: + parser = configparser.ConfigParser() + + if args.command is None: + Config.load_parser(parser, path=args.config) + elif args.command == "local": + load_local_crawler(args, parser) + + load_general(args, parser) + prune_crawlers(args, parser) + + return parser + + +def prune_crawlers( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + if not args.crawler: + return + + for section in parser.sections(): + if section.startswith("crawl:"): + # TODO Use removeprefix() when switching to 3.9 + name = section[len("crawl:"):] + if name not in args.crawler: + parser.remove_section(section) + + # TODO Check if crawlers actually exist + def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument( - "--config", "-c", - type=Path, - metavar="PATH", - help="specify custom config file path", - ) - parser.add_argument( - "--dump-config", - nargs="?", - const=True, - type=Path, - metavar="PATH", - help="dump current configuration to a file 
and exit." - " Uses default config file path if no path is specified", - ) - args = parser.parse_args() + args = PARSER.parse_args() try: - config_parser = Config.load_parser(args.config) - config = Config(config_parser) + config = Config(load_parser(args)) except ConfigLoadException: exit(1) - if args.dump_config: - path = None if args.dump_config is True else args.dump_config + if args.dump_config is not None: try: - config.dump(path) + if args.dump_config is True: + config.dump() + elif args.dump_config == "-": + config.dump_to_stdout() + else: + config.dump(Path(args.dump_config)) except ConfigDumpException: exit(1) exit() diff --git a/PFERD/config.py b/PFERD/config.py index 7a7e832..7fe5d9e 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -1,4 +1,6 @@ +import asyncio import os +import sys from configparser import ConfigParser, SectionProxy from dataclasses import dataclass from pathlib import Path @@ -68,7 +70,7 @@ class Config: raise ConfigLoadException() @staticmethod - def load_parser(path: Optional[Path] = None) -> ConfigParser: + def load_parser(parser: ConfigParser, path: Optional[Path] = None) -> None: """ May throw a ConfigLoadException. """ @@ -76,8 +78,6 @@ class Config: if not path: path = Config._default_path() - parser = ConfigParser() - # Using config.read_file instead of config.read because config.read # would just ignore a missing file and carry on. 
try: @@ -90,8 +90,6 @@ class Config: except PermissionError: Config._fail_load(path, "Insufficient permissions") - return parser - @staticmethod def _fail_dump(path: Path, reason: str) -> None: print(f"Failed to dump config file to {path}") @@ -123,7 +121,7 @@ class Config: self._parser.write(f) except FileExistsError: print("That file already exists.") - if prompt_yes_no("Overwrite it?", default=False): + if asyncio.run(prompt_yes_no("Overwrite it?", default=False)): with open(path, "w") as f: self._parser.write(f) else: @@ -133,6 +131,9 @@ class Config: except PermissionError: self._fail_dump(path, "Insufficient permissions") + def dump_to_stdout(self) -> None: + self._parser.write(sys.stdout) + @property def default_section(self) -> SectionProxy: return self._parser[self._parser.default_section] diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 4148614..140ae20 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -117,37 +117,25 @@ class CrawlerSection(Section): def redownload(self) -> Redownload: value = self.s.get("redownload", "never-smart") - if value == "never": - return Redownload.NEVER - elif value == "never-smart": - return Redownload.NEVER_SMART - elif value == "always": - return Redownload.ALWAYS - elif value == "always-smart": - return Redownload.ALWAYS_SMART - - self.invalid_value( - "redownload", - value, - "Must be 'never', 'never-smart', 'always' or 'always-smart'" - ) + try: + return Redownload.from_string(value) + except ValueError as e: + self.invalid_value( + "redownload", + value, + str(e).capitalize(), + ) def on_conflict(self) -> OnConflict: value = self.s.get("on_conflict", "prompt") - if value == "prompt": - return OnConflict.PROMPT - elif value == "local-first": - return OnConflict.LOCAL_FIRST - elif value == "remote-first": - return OnConflict.REMOTE_FIRST - elif value == "no-delete": - return OnConflict.NO_DELETE - - self.invalid_value( - "on_conflict", - value, - "Must be 'prompt', 'local-first', 'remote-first' or 
'no-delete'", - ) + try: + return OnConflict.from_string(value) + except ValueError as e: + self.invalid_value( + "on_conflict", + value, + str(e).capitalize(), + ) def transform(self) -> str: return self.s.get("transform", "") diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 89c5839..4f5f708 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -32,6 +32,14 @@ class Redownload(Enum): ALWAYS = "always" ALWAYS_SMART = "always-smart" + @staticmethod + def from_string(string: str) -> "Redownload": + try: + return Redownload(string) + except ValueError: + raise ValueError("must be one of 'never', 'never-smart'," + " 'always', 'always-smart'") + class OnConflict(Enum): PROMPT = "prompt" @@ -39,6 +47,14 @@ class OnConflict(Enum): REMOTE_FIRST = "remote-first" NO_DELETE = "no-delete" + @staticmethod + def from_string(string: str) -> "OnConflict": + try: + return OnConflict(string) + except ValueError: + raise ValueError("must be one of 'prompt', 'local-first'," + " 'remote-first', 'no-delete'") + @dataclass class Heuristics: From 989032fe0c3b90aa5c034657ade7df54b1b2016f Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 22:25:41 +0200 Subject: [PATCH 130/524] Fix cookies getting deleted --- PFERD/output_dir.py | 2 +- PFERD/report.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 4f5f708..fa0944b 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -378,7 +378,7 @@ class OutputDirectory: pass async def _cleanup_file(self, path: Path, pure: PurePath) -> None: - if self._report.marked(pure): + if self._report.is_marked(pure): return if await self._conflict_delete_lf(self._on_conflict, pure): diff --git a/PFERD/report.py b/PFERD/report.py index 2c7d8af..1c46216 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -62,7 +62,7 @@ class Report: detail, see the respective exception's docstring. 
""" - for other in self.known_files & self.reserved_files: + for other in self.marked: if path == other: raise MarkDuplicateException(path) @@ -71,8 +71,12 @@ class Report: self.known_files.add(path) - def marked(self, path: PurePath) -> bool: - return path in self.known_files + @property + def marked(self) -> Set[PurePath]: + return self.known_files | self.reserved_files + + def is_marked(self, path: PurePath) -> bool: + return path in self.marked def add_file(self, path: PurePath) -> None: """ From 9fd356d29044ac4b9a3ad36c464601048338d0b1 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 May 2021 23:00:40 +0200 Subject: [PATCH 131/524] Ensure tmp files are deleted This doesn't seem to fix the case where an exception bubbles up to the top of the event loop. It also doesn't seem to fix the case when a KeyboardInterrupt is thrown, since that never makes its way into the event loop in the first place. Both of these cases lead to the event loop stopping, which means that the tmp file cleanup doesn't get executed even though it's inside a "with" or "finally". 
--- PFERD/output_dir.py | 59 +++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index fa0944b..23d4a31 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -3,13 +3,14 @@ import os import random import shutil import string -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager, contextmanager from dataclasses import dataclass from datetime import datetime from enum import Enum from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import AsyncContextManager, AsyncIterator, BinaryIO, Optional +from typing import (AsyncContextManager, AsyncIterator, BinaryIO, Iterator, + Optional) from rich.markup import escape @@ -327,36 +328,42 @@ class OutputDirectory: mtimestamp = mtime.timestamp() os.utime(info.local_path, times=(mtimestamp, mtimestamp)) + @contextmanager + def _ensure_deleted(self, path: Path) -> Iterator[None]: + try: + yield + finally: + path.unlink(missing_ok=True) + async def _after_download(self, info: DownloadInfo) -> None: - changed = False + with self._ensure_deleted(info.tmp_path): + changed = False - if not info.success: - info.tmp_path.unlink() - return - - # Solve conflicts arising from existing local file - if info.local_path.exists(): - changed = True - if filecmp.cmp(info.local_path, info.tmp_path): - self._update_metadata(info) - info.tmp_path.unlink() + if not info.success: return - if not await self._conflict_lfrf(info.on_conflict, info.path): - info.tmp_path.unlink() - return + # Solve conflicts arising from existing local file + if info.local_path.exists(): + changed = True - info.tmp_path.replace(info.local_path) - self._update_metadata(info) + if filecmp.cmp(info.local_path, info.tmp_path): + self._update_metadata(info) + return - if changed: - self._conductor.print( - f"[bold bright_yellow]Changed[/] {escape(str(info.path))}") - 
self._report.change_file(info.path) - else: - self._conductor.print( - f"[bold bright_green]Added[/] {escape(str(info.path))}") - self._report.add_file(info.path) + if not await self._conflict_lfrf(info.on_conflict, info.path): + return + + info.tmp_path.replace(info.local_path) + self._update_metadata(info) + + if changed: + self._conductor.print( + f"[bold bright_yellow]Changed[/] {escape(str(info.path))}") + self._report.change_file(info.path) + else: + self._conductor.print( + f"[bold bright_green]Added[/] {escape(str(info.path))}") + self._report.add_file(info.path) async def cleanup(self) -> None: await self._cleanup_dir(self._root, PurePath()) From cf6903d109fead73a622351f98d24b05d013e93a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 15 May 2021 22:46:26 +0200 Subject: [PATCH 132/524] Retry crawling on I/O failure --- PFERD/crawlers/ilias.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index e52d329..b3190c6 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -596,12 +596,15 @@ class IliasCrawler(HttpCrawler): # TODO: Proper exception raise RuntimeError("Get page failed too often") print(url) - async with self.session.get(url) as request: - soup = soupify(await request.read()) - if self._is_logged_in(soup): - return soup + try: + async with self.session.get(url) as request: + soup = soupify(await request.read()) + if self._is_logged_in(soup): + return soup - await self._shibboleth_login.login(self.session) + await self._shibboleth_login.login(self.session) + except Exception: + return await self._get_page(url, retries_left - 1) return await self._get_page(url, retries_left - 1) From 9ec0d3e16ac756e0ce5913f6a1bb30add1985e1f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 16 May 2021 11:54:42 +0200 Subject: [PATCH 133/524] Implement date-demangling in ILIAS crawler --- PFERD/crawlers/ilias.py | 50 +++++++++++++++++++++++++++++++++++++++-- 1 
file changed, 48 insertions(+), 2 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index b3190c6..18d33ff 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -3,7 +3,7 @@ import json import re from configparser import SectionProxy from dataclasses import dataclass, field -from datetime import datetime +from datetime import date, datetime, timedelta from enum import Enum from pathlib import PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated @@ -424,9 +424,55 @@ class IliasPage: """ return urljoin(self._page_url, link_tag.get("href")) +german_months = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez'] +english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] def demangle_date(date_str: str) -> Optional[datetime]: - return None + """ + Demangle a given date in one of the following formats: + "Gestern, HH:MM" + "Heute, HH:MM" + "Morgen, HH:MM" + "dd. mon yyyy, HH:MM + """ + try: + date_str = re.sub(r"\s+", " ", date_str) + date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) + date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) + date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) + for german, english in zip(german_months, english_months): + date_str = date_str.replace(german, english) + # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" + date_str = date_str.replace(english + ".", english) + + # We now have a nice english String in the format: "dd. 
mmm yyyy, hh:mm" + day_part, time_part = date_str.split(",") + day_str, month_str, year_str = day_part.split(" ") + + day = int(day_str.strip().replace(".", "")) + month = english_months.index(month_str.strip()) + 1 + year = int(year_str.strip()) + + hour_str, minute_str = time_part.split(":") + hour = int(hour_str) + minute = int(minute_str) + + return datetime(year, month, day, hour, minute) + except Exception: + # TODO: Properly log this + print(f"Could not parse date {date_str!r}") + return None + +def _format_date_english(date: date) -> str: + month = english_months[date.month - 1] + return f"{date.day:02d}. {month} {date.year:04d}" + +def _yesterday() -> date: + return date.today() - timedelta(days=1) + + +def _tomorrow() -> date: + return date.today() + timedelta(days=1) def _sanitize_path_name(name: str) -> str: From 1c226c31aae2e4eeac28eb0a8238485b7854098c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 16 May 2021 13:01:30 +0200 Subject: [PATCH 134/524] Add some repeat annotations to the ILIAS crawler --- PFERD/crawlers/ilias.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 18d33ff..3f09789 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -550,6 +550,7 @@ class IliasCrawler(HttpCrawler): async def _crawl_desktop(self) -> None: await self._crawl_url(self._base_url) + @arepeat(3) async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: tasks = [] @@ -567,8 +568,11 @@ class IliasCrawler(HttpCrawler): page = IliasPage(soup, url, None) for child in page.get_child_elements(): tasks.append(self._handle_ilias_element(PurePath("."), child)) + await asyncio.gather(*tasks) + @arepeat(3) + @anoncritical async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: tasks = [] async with self.crawl_bar(path): @@ -580,6 +584,7 @@ class IliasCrawler(HttpCrawler): await 
asyncio.gather(*tasks) + @anoncritical async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) @@ -601,6 +606,7 @@ class IliasCrawler(HttpCrawler): # TODO: Proper exception raise RuntimeError(f"Unknown type: {element.type!r}") + @arepeat(3) async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) @@ -621,6 +627,7 @@ class IliasCrawler(HttpCrawler): sink.done() + @arepeat(3) async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) if not dl: @@ -638,19 +645,18 @@ class IliasCrawler(HttpCrawler): sink.done() async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup: + # This function will retry itself a few times if it is not logged in - it won't handle + # connection errors if retries_left < 0: # TODO: Proper exception raise RuntimeError("Get page failed too often") print(url) - try: - async with self.session.get(url) as request: - soup = soupify(await request.read()) - if self._is_logged_in(soup): - return soup + async with self.session.get(url) as request: + soup = soupify(await request.read()) + if self._is_logged_in(soup): + return soup - await self._shibboleth_login.login(self.session) - except Exception: - return await self._get_page(url, retries_left - 1) + await self._shibboleth_login.login(self.session) return await self._get_page(url, retries_left - 1) From 5ccb17622e0988a7cc21fe4559041b1b20a92771 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 16 May 2021 13:01:41 +0200 Subject: [PATCH 135/524] Configure pycodestyle to use a max line length of 110 --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg 
index 18ff558..4297032 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,3 +14,6 @@ install_requires = [options.entry_points] console_scripts = pferd = PFERD.__main__:main + +[pycodestyle] +max-line-length = 110 \ No newline at end of file From cd5aa618347b43dd9725718782ad9626f3ec4839 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 16 May 2021 13:17:01 +0200 Subject: [PATCH 136/524] Set max line length for pylint --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index 4297032..288cd3c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,4 +16,7 @@ console_scripts = pferd = PFERD.__main__:main [pycodestyle] +max-line-length = 110 + +[pylint.FORMAT] max-line-length = 110 \ No newline at end of file From 2b6235dc78386a488c48c4704a061c09e3ca5a0e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 16 May 2021 13:17:12 +0200 Subject: [PATCH 137/524] Fix pylint warnings (and 2 found bugs) in ILIAS crawler --- PFERD/crawlers/ilias.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 3f09789..00bb04b 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -1,13 +1,12 @@ import asyncio import json import re -from configparser import SectionProxy -from dataclasses import dataclass, field +from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum from pathlib import PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import Any, AsyncContextManager, Dict, List, Optional, Set, Union +from typing import Any, Dict, List, Optional, Set, Union from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, urlunsplit) @@ -19,9 +18,7 @@ from PFERD.utils import soupify from ..authenticators import Authenticator from ..conductor import TerminalConductor from ..config import Config -from ..crawler import (Crawler, CrawlerSection, HttpCrawler, 
anoncritical, - arepeat) -from ..output_dir import FileSink +from ..crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat TargetType = Union[str, int] @@ -285,8 +282,8 @@ class IliasPage: if not element_type: continue - elif element_type == IliasElementType.MEETING: - element_path = _sanitize_path_name(self._normalize_meeting_name(element_name)) + if element_type == IliasElementType.MEETING: + element_name = _sanitize_path_name(self._normalize_meeting_name(element_name)) elif element_type == IliasElementType.FILE: result.append(self._file_to_element(element_name, abs_url, link)) continue @@ -424,9 +421,11 @@ class IliasPage: """ return urljoin(self._page_url, link_tag.get("href")) + german_months = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez'] english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + def demangle_date(date_str: str) -> Optional[datetime]: """ Demangle a given date in one of the following formats: @@ -463,9 +462,11 @@ def demangle_date(date_str: str) -> Optional[datetime]: print(f"Could not parse date {date_str!r}") return None -def _format_date_english(date: date) -> str: - month = english_months[date.month - 1] - return f"{date.day:02d}. {month} {date.year:04d}" + +def _format_date_english(date_to_format: date) -> str: + month = english_months[date_to_format.month - 1] + return f"{date_to_format.day:02d}. 
{month} {date_to_format.year:04d}" + def _yesterday() -> date: return date.today() - timedelta(days=1) @@ -617,7 +618,7 @@ class IliasCrawler(HttpCrawler): page = IliasPage(await self._get_page(element.url), element.url, element) real_element = page.get_child_elements()[0] - async with dl as sink, self.session.get(element.url) as resp: + async with dl as sink, self.session.get(real_element.url) as resp: if resp.content_length: bar.set_total(resp.content_length) From 467ea3a37eebb56b0cf5ec7c85e43ffd00e6d025 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 16 May 2021 13:26:58 +0200 Subject: [PATCH 138/524] Document ILIAS-Crawler arguments in CONFIG.md --- CONFIG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CONFIG.md b/CONFIG.md index 53c0706..bd24b16 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -120,6 +120,14 @@ crawler simulate a slower, network-based crawler. download requests. (Default: 0.0) - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) +### The `kit-ilias` crawler + +This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor SCC-Server, so you should be nice and use reasonable delays and concurrent requests. +- `target`: The ILIAS element to crawl. 
Can be: + - `desktop` if you want to crawl your personal desktop + - `` if you want to crawl the course with the given id + - `` if you want to crawl a given element by URL (preferably the permanent URL linked at the bottom of an ILIAS page) +- `tfa_auth`: Like `auth` but only used for two-factor authentication ## Authenticator types ### The `simple` authenticator From 8b76ebb3efb5cf674b6ffa024dc65f4e389fdf88 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 16 May 2021 13:28:06 +0200 Subject: [PATCH 139/524] Rename IliasCrawler to KitIliasCrawler --- PFERD/crawlers/__init__.py | 6 +++--- PFERD/crawlers/ilias.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index 0ae2ca3..41733cb 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -5,7 +5,7 @@ from ..authenticator import Authenticator from ..conductor import TerminalConductor from ..config import Config from ..crawler import Crawler -from .ilias import IliasCrawler, IliasCrawlerSection +from .ilias import KitIliasCrawler, KitIliasCrawlerSection from .local import LocalCrawler, LocalCrawlerSection CrawlerConstructor = Callable[[ @@ -19,6 +19,6 @@ CrawlerConstructor = Callable[[ CRAWLERS: Dict[str, CrawlerConstructor] = { "local": lambda n, s, c, t, a: LocalCrawler(n, LocalCrawlerSection(s), c, t), - "ilias": lambda n, s, c, t, a: - IliasCrawler(n, IliasCrawlerSection(s), c, t, a), + "kit-ilias": lambda n, s, c, t, a: + KitIliasCrawler(n, KitIliasCrawlerSection(s), c, t, a), } diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 00bb04b..edb48a8 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -23,7 +23,7 @@ from ..crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat TargetType = Union[str, int] -class IliasCrawlerSection(CrawlerSection): +class KitIliasCrawlerSection(CrawlerSection): def target(self) -> TargetType: target = self.s.get("target") @@ -510,11 
+510,11 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([ ]) -class IliasCrawler(HttpCrawler): +class KitIliasCrawler(HttpCrawler): def __init__( self, name: str, - section: IliasCrawlerSection, + section: KitIliasCrawlerSection, config: Config, conductor: TerminalConductor, authenticators: Dict[str, Authenticator] From 3efec53f51ce46983605225efee70fd10172f0d0 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 16 May 2021 14:31:43 +0200 Subject: [PATCH 140/524] Configure code checking and formatting tools Checking - mypy - flake8 (which uses pyflakes and pycodestyle) Formatting - autopep8 - isort --- DEV.md | 25 ++++++++++++++++++++----- scripts/check | 4 ++++ scripts/format | 4 ++++ scripts/setup | 5 +++++ setup.cfg | 8 ++++---- 5 files changed, 37 insertions(+), 9 deletions(-) create mode 100755 scripts/check create mode 100755 scripts/format create mode 100755 scripts/setup diff --git a/DEV.md b/DEV.md index a679b4a..212cec8 100644 --- a/DEV.md +++ b/DEV.md @@ -15,12 +15,14 @@ environment, run these commands in the same directory as this file: ``` $ python -m venv .venv $ . .venv/bin/activate -$ pip install --editable . +$ ./scripts/setup ``` -After this, you can use PFERD as if it was installed normally. Since PFERD was -installed with `--editable`, there is no need to re-run `pip install` when the -source code is changed. +The setup script installs a few required dependencies and tools. It also +installs PFERD via `pip install --editable .`, which means that you can just run +`pferd` as if it was installed normally. Since PFERD was installed with +`--editable`, there is no need to re-run `pip install` when the source code is +changed. For more details, see [this part of the Python Tutorial][venv-tut] and [this section on "development mode"][ppug-dev]. @@ -29,9 +31,22 @@ For more details, see [this part of the Python Tutorial][venv-tut] and [venv-tut]: "12. 
Virtual Environments and Packages" [ppug-dev]: "Working in “development mode”" +## Checking and formatting the code + +To run a set of checks against the code, run `./scripts/check` in the repo's +root directory. This script will run a few tools installed by `./scripts/setup` +against the entire project. + +To format the code, run `./scripts/format` in the repo's root directory. + +Before committing changes, please make sure the checks return no warnings and +the code is formatted. + ## Contributing When submitting a PR that adds, changes or modifies a feature, please ensure -that the corresponding documentation is updated. +that the corresponding documentation is updated as well. Also, please ensure +that `./scripts/check` returns no warnings and the code has been run through +`./scripts/format`. In your first PR, please add your name to the `LICENSE` file. diff --git a/scripts/check b/scripts/check new file mode 100755 index 0000000..ba767cd --- /dev/null +++ b/scripts/check @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +mypy PFERD +flake8 PFERD diff --git a/scripts/format b/scripts/format new file mode 100755 index 0000000..cc196ae --- /dev/null +++ b/scripts/format @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +autopep8 --recursive --in-place PFERD +isort PFERD diff --git a/scripts/setup b/scripts/setup new file mode 100755 index 0000000..8a5399b --- /dev/null +++ b/scripts/setup @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +pip install --upgrade pip setuptools +pip install --editable . 
+pip install --upgrade mypy flake8 autopep8 isort diff --git a/setup.cfg b/setup.cfg index 288cd3c..f6b64ea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,8 +15,8 @@ install_requires = console_scripts = pferd = PFERD.__main__:main -[pycodestyle] -max-line-length = 110 +[flake8] +max_line_length = 110 -[pylint.FORMAT] -max-line-length = 110 \ No newline at end of file +[isort] +line_length = 110 From 0bae0091896ab3b0d7c1d46d0cf333f8e31ecbea Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 16 May 2021 14:32:53 +0200 Subject: [PATCH 141/524] Run formatting tools --- PFERD/crawler.py | 3 +-- PFERD/crawlers/ilias.py | 4 ++-- PFERD/output_dir.py | 3 +-- PFERD/utils.py | 2 ++ 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 140ae20..cb31223 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -3,8 +3,7 @@ from contextlib import asynccontextmanager from datetime import datetime from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import (Any, AsyncContextManager, AsyncIterator, Awaitable, - Callable, Dict, Optional, TypeVar) +from typing import Any, AsyncContextManager, AsyncIterator, Awaitable, Callable, Dict, Optional, TypeVar import aiohttp from rich.markup import escape diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index edb48a8..f2a7656 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -7,11 +7,11 @@ from enum import Enum from pathlib import PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated from typing import Any, Dict, List, Optional, Set, Union -from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, - urlunsplit) +from urllib.parse import parse_qs, urlencode, urljoin, urlparse, urlsplit, urlunsplit import aiohttp from bs4 import BeautifulSoup, Tag + from PFERD.output_dir import Redownload from PFERD.utils import soupify diff --git a/PFERD/output_dir.py 
b/PFERD/output_dir.py index 23d4a31..ae69d10 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -9,8 +9,7 @@ from datetime import datetime from enum import Enum from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import (AsyncContextManager, AsyncIterator, BinaryIO, Iterator, - Optional) +from typing import AsyncContextManager, AsyncIterator, BinaryIO, Iterator, Optional from rich.markup import escape diff --git a/PFERD/utils.py b/PFERD/utils.py index d7c61ec..3022ab6 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -25,6 +25,7 @@ async def ainput(prompt: str) -> str: async def agetpass(prompt: str) -> str: return await to_thread(lambda: getpass.getpass(prompt)) + def soupify(data: bytes) -> bs4.BeautifulSoup: """ Parses HTML to a beautifulsoup object. @@ -32,6 +33,7 @@ def soupify(data: bytes) -> bs4.BeautifulSoup: return bs4.BeautifulSoup(data, "html.parser") + async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: """ Asks the user a yes/no question and returns their choice. 
From b8efcc2ca5309fc3d3da6b89fa21761371d4114e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 17 May 2021 21:30:26 +0200 Subject: [PATCH 142/524] Respect filters in ILIAS crawler --- PFERD/crawlers/ilias.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index f2a7656..09bad09 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -589,6 +589,9 @@ class KitIliasCrawler(HttpCrawler): async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) + if not self.should_crawl(element_path): + return + if element.type == IliasElementType.FILE: await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: From db1219d4a9cd8bb0522803c84e7f1e6203a6b262 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 17 May 2021 21:31:22 +0200 Subject: [PATCH 143/524] Create a link file in ILIAS crawler This allows us to crawl links and represent them in the file system. Users can choose between an ILIAS-imitation (that optionally auto-redirects) and a plain text variant. --- CONFIG.md | 6 ++ PFERD/crawlers/ilias.py | 139 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 141 insertions(+), 4 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index bd24b16..6149ef5 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -128,6 +128,12 @@ This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor S - `` if you want to crawl the course with the given id - `` if you want to crawl a given element by URL (preferably the permanent URL linked at the bottom of an ILIAS page) - `tfa_auth`: Like `auth` but only used for two-factor authentication +- `link_file_redirect_delay`: PFERD will create local HTML for external links. + If this property is set to a non-negative value it configures the amount of seconds after which the local HTML + file will redirect you to the link target. 
+- `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link + target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional + HTML page instead. ## Authenticator types ### The `simple` authenticator diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 09bad09..4d81976 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -52,6 +52,12 @@ class KitIliasCrawlerSection(CrawlerSection): self.invalid_value("auth", value, "No such auth section exists") return auth + def link_file_redirect_delay(self) -> int: + return self.s.getint("link_file_redirect_delay", fallback=-1) + + def link_file_use_plaintext(self) -> bool: + return self.s.getboolean("link_file_plain_text", fallback=False) + class IliasElementType(Enum): EXERCISE = "exercise" @@ -72,6 +78,7 @@ class IliasPageElement: url: str name: str mtime: Optional[datetime] = None + description: Optional[str] = None class IliasPage: @@ -279,6 +286,7 @@ class IliasPage: abs_url = self._abs_url_from_link(link) element_name = _sanitize_path_name(link.getText()) element_type = self._find_type_from_link(element_name, link, abs_url) + description = self._find_link_description(link) if not element_type: continue @@ -288,10 +296,19 @@ class IliasPage: result.append(self._file_to_element(element_name, abs_url, link)) continue - result.append(IliasPageElement(element_type, abs_url, element_name, None)) + result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) return result + def _find_link_description(self, link: Tag) -> Optional[str]: + tile: Tag = link.findParent("div", {"class": lambda x: x and "il_ContainerListItem" in x}) + if not tile: + return None + description_element: Tag = tile.find("div", {"class": lambda x: x and "il_Description" in x}) + if not description_element: + return None + return description_element.getText().strip() + def 
_file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement: # Files have a list of properties (type, modification date, size, etc.) # In a series of divs. @@ -528,6 +545,8 @@ class KitIliasCrawler(HttpCrawler): self._base_url = "https://ilias.studium.kit.edu" self._target = section.target() + self._link_file_redirect_delay = section.link_file_redirect_delay() + self._link_file_use_plaintext = section.link_file_use_plaintext() async def crawl(self) -> None: if isinstance(self._target, int): @@ -598,8 +617,7 @@ class KitIliasCrawler(HttpCrawler): # TODO: Delete self.print(f"Skipping forum [green]{element_path}[/]") elif element.type == IliasElementType.LINK: - # TODO: Write in meta-redirect file - self.print(f"Skipping link [green]{element_path}[/]") + await self._download_link(element, element_path) elif element.type == IliasElementType.VIDEO: await self._download_file(element, element_path) elif element.type == IliasElementType.VIDEO_PLAYER: @@ -610,6 +628,30 @@ class KitIliasCrawler(HttpCrawler): # TODO: Proper exception raise RuntimeError(f"Unknown type: {element.type!r}") + @arepeat(3) + async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: + dl = await self.download(element_path, mtime=element.mtime) + if not dl: + return + + async with self.download_bar(element_path, 2) as bar: + export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") + async with self.session.get(export_url) as response: + html_page: BeautifulSoup = soupify(await response.read()) + real_url: str = html_page.select_one("a").get("href").strip() + + bar.advance(1) + + async with dl as sink: + content = _link_template_plain if self._link_file_use_plaintext else _link_template_rich + content = content.replace("{{link}}", real_url) + content = content.replace("{{name}}", element.name) + content = content.replace("{{description}}", str(element.description)) + content = content.replace("{{redirect_delay}}", 
str(self._link_file_redirect_delay)) + sink.file.write(content.encode("utf-8")) + bar.advance(1) + sink.done() + @arepeat(3) async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky @@ -654,7 +696,7 @@ class KitIliasCrawler(HttpCrawler): if retries_left < 0: # TODO: Proper exception raise RuntimeError("Get page failed too often") - print(url) + print(url, "retries left", retries_left) async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): @@ -792,3 +834,92 @@ class KitShibbolethLogin: async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: async with session.post(url, data=data) as response: return soupify(await response.read()) + +_link_template_plain = "{{link}}" +# flake8: noqa E501 +_link_template_rich = """ + + + + + ILIAS - Link: {{ name}} + + + + + +
+ +
+ +
{{description}}
+
+ +
+ + +""" From 1525aa15a6dd9f09d70af1e1f994ed03fb6cf5db Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 18 May 2021 22:40:28 +0200 Subject: [PATCH 144/524] Fix link template error and use indeterminate progress bar --- PFERD/crawlers/ilias.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 4d81976..014f231 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -634,14 +634,12 @@ class KitIliasCrawler(HttpCrawler): if not dl: return - async with self.download_bar(element_path, 2) as bar: + async with self.download_bar(element_path): export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") async with self.session.get(export_url) as response: html_page: BeautifulSoup = soupify(await response.read()) real_url: str = html_page.select_one("a").get("href").strip() - bar.advance(1) - async with dl as sink: content = _link_template_plain if self._link_file_use_plaintext else _link_template_rich content = content.replace("{{link}}", real_url) @@ -649,7 +647,6 @@ class KitIliasCrawler(HttpCrawler): content = content.replace("{{description}}", str(element.description)) content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) sink.file.write(content.encode("utf-8")) - bar.advance(1) sink.done() @arepeat(3) @@ -842,7 +839,7 @@ _link_template_rich = """ - ILIAS - Link: {{ name}} + ILIAS - Link: {{name}} From 4b68fa771fb89dac8615cca1fec09c4743893342 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 18 May 2021 22:43:46 +0200 Subject: [PATCH 145/524] Move logging logic to singleton - Renamed module and class because "conductor" didn't make a lot of sense - Used singleton approach (there's only one stdout after all) - Redesigned progress bars (now with download speed!) 
--- PFERD/authenticator.py | 3 - PFERD/authenticators/__init__.py | 10 +- PFERD/authenticators/simple.py | 7 +- PFERD/authenticators/tfa.py | 7 +- PFERD/conductor.py | 95 ------------------ PFERD/crawler.py | 48 ++-------- PFERD/crawlers/__init__.py | 10 +- PFERD/crawlers/ilias.py | 6 +- PFERD/crawlers/local.py | 4 +- PFERD/logging.py | 160 +++++++++++++++++++++++++++++++ PFERD/output_dir.py | 21 ++-- PFERD/pferd.py | 17 +--- 12 files changed, 195 insertions(+), 193 deletions(-) delete mode 100644 PFERD/conductor.py create mode 100644 PFERD/logging.py diff --git a/PFERD/authenticator.py b/PFERD/authenticator.py index 7475e2a..d67b263 100644 --- a/PFERD/authenticator.py +++ b/PFERD/authenticator.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod from typing import Tuple -from .conductor import TerminalConductor from .config import Config, Section @@ -23,7 +22,6 @@ class Authenticator(ABC): name: str, section: AuthSection, config: Config, - conductor: TerminalConductor, ) -> None: """ Initialize an authenticator from its name and its section in the config @@ -36,7 +34,6 @@ class Authenticator(ABC): """ self.name = name - self.conductor = conductor @abstractmethod async def credentials(self) -> Tuple[str, str]: diff --git a/PFERD/authenticators/__init__.py b/PFERD/authenticators/__init__.py index 97ff03a..35096cf 100644 --- a/PFERD/authenticators/__init__.py +++ b/PFERD/authenticators/__init__.py @@ -2,7 +2,6 @@ from configparser import SectionProxy from typing import Callable, Dict from ..authenticator import Authenticator, AuthSection -from ..conductor import TerminalConductor from ..config import Config from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator @@ -11,12 +10,11 @@ AuthConstructor = Callable[[ str, # Name (without the "auth:" prefix) SectionProxy, # Authenticator's section of global config Config, # Global config - TerminalConductor, # Global conductor instance ], Authenticator] AUTHENTICATORS: Dict[str, 
AuthConstructor] = { - "simple": lambda n, s, c, t: - SimpleAuthenticator(n, SimpleAuthSection(s), c, t), - "tfa": lambda n, s, c, t: - TfaAuthenticator(n, AuthSection(s), c, t), + "simple": lambda n, s, c: + SimpleAuthenticator(n, SimpleAuthSection(s), c), + "tfa": lambda n, s, c: + TfaAuthenticator(n, AuthSection(s), c), } diff --git a/PFERD/authenticators/simple.py b/PFERD/authenticators/simple.py index f21661c..caa0002 100644 --- a/PFERD/authenticators/simple.py +++ b/PFERD/authenticators/simple.py @@ -1,8 +1,8 @@ from typing import Optional, Tuple from ..authenticator import Authenticator, AuthException, AuthSection -from ..conductor import TerminalConductor from ..config import Config +from ..logging import log from ..utils import agetpass, ainput @@ -20,9 +20,8 @@ class SimpleAuthenticator(Authenticator): name: str, section: SimpleAuthSection, config: Config, - conductor: TerminalConductor, ) -> None: - super().__init__(name, section, config, conductor) + super().__init__(name, section, config) self._username = section.username() self._password = section.password() @@ -34,7 +33,7 @@ class SimpleAuthenticator(Authenticator): if self._username is not None and self._password is not None: return self._username, self._password - async with self.conductor.exclusive_output(): + async with log.exclusive_output(): if self._username is None: self._username = await ainput("Username: ") else: diff --git a/PFERD/authenticators/tfa.py b/PFERD/authenticators/tfa.py index 3513d09..b0eef18 100644 --- a/PFERD/authenticators/tfa.py +++ b/PFERD/authenticators/tfa.py @@ -1,8 +1,8 @@ from typing import Tuple from ..authenticator import Authenticator, AuthException, AuthSection -from ..conductor import TerminalConductor from ..config import Config +from ..logging import log from ..utils import ainput @@ -12,15 +12,14 @@ class TfaAuthenticator(Authenticator): name: str, section: AuthSection, config: Config, - conductor: TerminalConductor, ) -> None: - super().__init__(name, 
section, config, conductor) + super().__init__(name, section, config) async def username(self) -> str: raise AuthException("TFA authenticator does not support usernames") async def password(self) -> str: - async with self.conductor.exclusive_output(): + async with log.exclusive_output(): code = await ainput("TFA code: ") return code diff --git a/PFERD/conductor.py b/PFERD/conductor.py deleted file mode 100644 index d50574e..0000000 --- a/PFERD/conductor.py +++ /dev/null @@ -1,95 +0,0 @@ -import asyncio -from contextlib import asynccontextmanager, contextmanager -from types import TracebackType -from typing import AsyncIterator, Iterator, List, Optional, Type - -from rich.console import Console -from rich.progress import Progress, TaskID - - -class ProgressBar: - def __init__(self, progress: Progress, taskid: TaskID): - self._progress = progress - self._taskid = taskid - - def advance(self, amount: float = 1) -> None: - self._progress.advance(self._taskid, advance=amount) - - def set_total(self, total: float) -> None: - self._progress.update(self._taskid, total=total) - self._progress.start_task(self._taskid) - - -class TerminalConductor: - def __init__(self) -> None: - self._stopped = False - self._lock = asyncio.Lock() - self._lines: List[str] = [] - - self._console = Console(highlight=False) - self._progress = Progress(console=self._console) - - async def _start(self) -> None: - for task in self._progress.tasks: - task.visible = True - self._progress.start() - - self._stopped = False - - for line in self._lines: - self.print(line) - self._lines = [] - - async def _stop(self) -> None: - self._stopped = True - - for task in self._progress.tasks: - task.visible = False - self._progress.stop() - - async def __aenter__(self) -> None: - async with self._lock: - await self._start() - - async def __aexit__( - self, - exc_type: Optional[Type[BaseException]], - exc_value: Optional[BaseException], - traceback: Optional[TracebackType], - ) -> Optional[bool]: - async with 
self._lock: - await self._stop() - return None - - def print(self, line: str) -> None: - if self._stopped: - self._lines.append(line) - else: - self._console.print(line) - - @asynccontextmanager - async def exclusive_output(self) -> AsyncIterator[None]: - async with self._lock: - await self._stop() - try: - yield - finally: - await self._start() - - @contextmanager - def progress_bar( - self, - description: str, - total: Optional[float] = None, - ) -> Iterator[ProgressBar]: - if total is None: - # Indeterminate progress bar - taskid = self._progress.add_task(description, start=False) - else: - taskid = self._progress.add_task(description, total=total) - - bar = ProgressBar(self._progress, taskid) - try: - yield bar - finally: - self._progress.remove_task(taskid) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index cb31223..677baa2 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -9,9 +9,9 @@ import aiohttp from rich.markup import escape from .authenticator import Authenticator -from .conductor import ProgressBar, TerminalConductor from .config import Config, Section from .limiter import Limiter +from .logging import ProgressBar, log from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload from .transformer import RuleParseException, Transformer from .version import __version__ @@ -36,7 +36,7 @@ def noncritical(f: Wrapped) -> Wrapped: try: f(self, *args, **kwargs) except Exception as e: - self.print(f"[red]Something went wrong: {escape(str(e))}") + log.print(f"[red]Something went wrong: {escape(str(e))}") self.error_free = False return wrapper # type: ignore @@ -79,7 +79,7 @@ def anoncritical(f: AWrapped) -> AWrapped: try: await f(self, *args, **kwargs) except Exception as e: - self.print(f"[red]Something went wrong: {escape(str(e))}") + log.print(f"[red]Something went wrong: {escape(str(e))}") self.error_free = False return wrapper # type: ignore @@ -182,7 +182,6 @@ class Crawler(ABC): name: str, section: CrawlerSection, config: Config, - 
conductor: TerminalConductor, ) -> None: """ Initialize a crawler from its name and its section in the config file. @@ -194,7 +193,6 @@ class Crawler(ABC): """ self.name = name - self._conductor = conductor self.error_free = True self._limiter = Limiter( @@ -213,34 +211,8 @@ class Crawler(ABC): config.working_dir / section.output_dir(name), section.redownload(), section.on_conflict(), - self._conductor, ) - def print(self, text: str) -> None: - """ - Print rich markup to the terminal. Crawlers *must* use this function to - print things unless they are holding an exclusive output context - manager! Be careful to escape all user-supplied strings. - """ - - self._conductor.print(text) - - def exclusive_output(self) -> AsyncContextManager[None]: - """ - Acquire exclusive rights™ to the terminal output. While this context - manager is held, output such as printing and progress bars from other - threads is suspended and the current thread may do whatever it wants - with the terminal. However, it must return the terminal to its original - state before exiting the context manager. - - No two threads can hold this context manager at the same time. - - Useful for password or confirmation prompts as well as running other - programs while crawling (e. g. to get certain credentials). 
- """ - - return self._conductor.exclusive_output() - @asynccontextmanager async def crawl_bar( self, @@ -249,7 +221,7 @@ class Crawler(ABC): ) -> AsyncIterator[ProgressBar]: desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}" async with self._limiter.limit_crawl(): - with self._conductor.progress_bar(desc, total=total) as bar: + with log.crawl_bar(desc, total=total) as bar: yield bar @asynccontextmanager @@ -260,7 +232,7 @@ class Crawler(ABC): ) -> AsyncIterator[ProgressBar]: desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}" async with self._limiter.limit_download(): - with self._conductor.progress_bar(desc, total=total) as bar: + with log.download_bar(desc, total=total) as bar: yield bar def should_crawl(self, path: PurePath) -> bool: @@ -289,7 +261,7 @@ class Crawler(ABC): crawler. """ - async with self._conductor: + with log.show_progress(): await self.crawl() @abstractmethod @@ -312,9 +284,8 @@ class HttpCrawler(Crawler): name: str, section: CrawlerSection, config: Config, - conductor: TerminalConductor, ) -> None: - super().__init__(name, section, config, conductor) + super().__init__(name, section, config) self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) self._output_dir.register_reserved(self.COOKIE_FILE) @@ -340,7 +311,4 @@ class HttpCrawler(Crawler): try: cookie_jar.save(self._cookie_jar_path) except Exception: - self.print( - "[bold red]Warning:[/] Failed to save cookies to " - + escape(str(self.COOKIE_FILE)) - ) + log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index 41733cb..72d6798 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -2,7 +2,6 @@ from configparser import SectionProxy from typing import Callable, Dict from ..authenticator import Authenticator -from ..conductor import TerminalConductor from ..config import Config from ..crawler import Crawler from .ilias import 
KitIliasCrawler, KitIliasCrawlerSection @@ -12,13 +11,12 @@ CrawlerConstructor = Callable[[ str, # Name (without the "crawl:" prefix) SectionProxy, # Crawler's section of global config Config, # Global config - TerminalConductor, # Global conductor instance Dict[str, Authenticator], # Loaded authenticators by name ], Crawler] CRAWLERS: Dict[str, CrawlerConstructor] = { - "local": lambda n, s, c, t, a: - LocalCrawler(n, LocalCrawlerSection(s), c, t), - "kit-ilias": lambda n, s, c, t, a: - KitIliasCrawler(n, KitIliasCrawlerSection(s), c, t, a), + "local": lambda n, s, c, a: + LocalCrawler(n, LocalCrawlerSection(s), c), + "kit-ilias": lambda n, s, c, a: + KitIliasCrawler(n, KitIliasCrawlerSection(s), c, a), } diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index 014f231..beac208 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -16,7 +16,6 @@ from PFERD.output_dir import Redownload from PFERD.utils import soupify from ..authenticators import Authenticator -from ..conductor import TerminalConductor from ..config import Config from ..crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat @@ -533,10 +532,9 @@ class KitIliasCrawler(HttpCrawler): name: str, section: KitIliasCrawlerSection, config: Config, - conductor: TerminalConductor, authenticators: Dict[str, Authenticator] ): - super().__init__(name, section, config, conductor) + super().__init__(name, section, config) self._shibboleth_login = KitShibbolethLogin( section.auth(authenticators), @@ -615,7 +613,7 @@ class KitIliasCrawler(HttpCrawler): await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: # TODO: Delete - self.print(f"Skipping forum [green]{element_path}[/]") + print(f"Skipping forum [green]{element_path}[/]") elif element.type == IliasElementType.LINK: await self._download_link(element, element_path) elif element.type == IliasElementType.VIDEO: diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 
2dde0d4..363107f 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -4,7 +4,6 @@ import random from pathlib import Path, PurePath from typing import Optional -from ..conductor import TerminalConductor from ..config import Config from ..crawler import Crawler, CrawlerSection, anoncritical @@ -44,9 +43,8 @@ class LocalCrawler(Crawler): name: str, section: LocalCrawlerSection, config: Config, - conductor: TerminalConductor, ): - super().__init__(name, section, config, conductor) + super().__init__(name, section, config) self._target = config.working_dir / section.target() self._crawl_delay = section.crawl_delay() diff --git a/PFERD/logging.py b/PFERD/logging.py new file mode 100644 index 0000000..b075d35 --- /dev/null +++ b/PFERD/logging.py @@ -0,0 +1,160 @@ +import asyncio +from contextlib import asynccontextmanager, contextmanager +# TODO In Python 3.9 and above, ContextManager and AsyncContextManager are deprecated +from typing import AsyncIterator, ContextManager, Iterator, List, Optional + +from rich.console import Console, RenderGroup +from rich.live import Live +from rich.markup import escape +from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, TextColumn, TimeRemainingColumn, + TransferSpeedColumn) +from rich.table import Column + + +class ProgressBar: + def __init__(self, progress: Progress, taskid: TaskID): + self._progress = progress + self._taskid = taskid + + def advance(self, amount: float = 1) -> None: + self._progress.advance(self._taskid, advance=amount) + + def set_total(self, total: float) -> None: + self._progress.update(self._taskid, total=total) + self._progress.start_task(self._taskid) + + +class Log: + def __init__(self) -> None: + self.console = Console(highlight=False) + + self._crawl_progress = Progress( + TextColumn("{task.description}", table_column=Column(ratio=1)), + BarColumn(), + TimeRemainingColumn(), + expand=True, + ) + self._download_progress = Progress( + TextColumn("{task.description}", 
table_column=Column(ratio=1)), + TransferSpeedColumn(), + DownloadColumn(), + BarColumn(), + TimeRemainingColumn(), + expand=True, + ) + + self._live = Live(console=self.console, transient=True) + self._update_live() + + self._showing_progress = False + self._progress_suspended = False + self._lock = asyncio.Lock() + self._lines: List[str] = [] + + # Whether different parts of the output are enabled or disabled + self._enabled_explain = False + self._enabled_action = True + self._enabled_report = True + + def _update_live(self) -> None: + elements = [] + if self._crawl_progress.task_ids: + elements.append(self._crawl_progress) + if self._download_progress.task_ids: + elements.append(self._download_progress) + + group = RenderGroup(*elements) # type: ignore + self._live.update(group) + + def configure(self, explain: bool, action: bool, report: bool) -> None: + self._enabled_explain = explain + self._enabled_action = action + self._enabled_report = report + + @contextmanager + def show_progress(self) -> Iterator[None]: + if self._showing_progress: + raise RuntimeError("Calling 'show_progress' while already showing progress") + + self._showing_progress = True + try: + with self._live: + yield + finally: + self._showing_progress = False + + @asynccontextmanager + async def exclusive_output(self) -> AsyncIterator[None]: + if not self._showing_progress: + raise RuntimeError("Calling 'exclusive_output' while not showing progress") + + async with self._lock: + self._progress_suspended = True + self._live.stop() + try: + yield + finally: + self._live.start() + self._progress_suspended = False + for line in self._lines: + self.print(line) + self._lines = [] + + def print(self, text: str) -> None: + if self._progress_suspended: + self._lines.append(text) + else: + self.console.print(text) + + def explain_topic(self, text: str) -> None: + if self._enabled_explain: + self.print(f"[cyan]{escape(text)}") + + def explain(self, text: str) -> None: + if self._enabled_explain: + 
self.print(f" {escape(text)}") + + def action(self, text: str) -> None: + if self._enabled_action: + self.print(text) + + def report(self, text: str) -> None: + if self._enabled_report: + self.print(text) + + @contextmanager + def _bar( + self, + progress: Progress, + description: str, + total: Optional[float], + ) -> Iterator[ProgressBar]: + if total is None: + # Indeterminate progress bar + taskid = progress.add_task(description, start=False) + else: + taskid = progress.add_task(description, total=total) + self._update_live() + + try: + yield ProgressBar(progress, taskid) + finally: + progress.remove_task(taskid) + self._update_live() + + def crawl_bar( + self, + description: str, + total: Optional[float] = None, + ) -> ContextManager[ProgressBar]: + return self._bar(self._crawl_progress, description, total) + + def download_bar( + self, + description: str, + total: Optional[float] = None, + ) -> ContextManager[ProgressBar]: + return self._bar(self._download_progress, description, total) + + +log = Log() diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index ae69d10..417fa52 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -13,7 +13,7 @@ from typing import AsyncContextManager, AsyncIterator, BinaryIO, Iterator, Optio from rich.markup import escape -from .conductor import TerminalConductor +from .logging import log from .report import MarkConflictException, MarkDuplicateException, Report from .utils import prompt_yes_no @@ -93,12 +93,10 @@ class OutputDirectory: root: Path, redownload: Redownload, on_conflict: OnConflict, - conductor: TerminalConductor, ): self._root = root self._redownload = redownload self._on_conflict = on_conflict - self._conductor = conductor self._report = Report() @@ -176,7 +174,7 @@ class OutputDirectory: path: PurePath, ) -> bool: if on_conflict == OnConflict.PROMPT: - async with self._conductor.exclusive_output(): + async with log.exclusive_output(): prompt = f"Replace {path} with remote file?" 
return await prompt_yes_no(prompt, default=False) elif on_conflict == OnConflict.LOCAL_FIRST: @@ -195,7 +193,7 @@ class OutputDirectory: path: PurePath, ) -> bool: if on_conflict == OnConflict.PROMPT: - async with self._conductor.exclusive_output(): + async with log.exclusive_output(): prompt = f"Recursively delete {path} and replace with remote file?" return await prompt_yes_no(prompt, default=False) elif on_conflict == OnConflict.LOCAL_FIRST: @@ -215,7 +213,7 @@ class OutputDirectory: parent: PurePath, ) -> bool: if on_conflict == OnConflict.PROMPT: - async with self._conductor.exclusive_output(): + async with log.exclusive_output(): prompt = f"Delete {parent} so remote file {path} can be downloaded?" return await prompt_yes_no(prompt, default=False) elif on_conflict == OnConflict.LOCAL_FIRST: @@ -234,7 +232,7 @@ class OutputDirectory: path: PurePath, ) -> bool: if on_conflict == OnConflict.PROMPT: - async with self._conductor.exclusive_output(): + async with log.exclusive_output(): prompt = f"Delete {path}?" 
return await prompt_yes_no(prompt, default=False) elif on_conflict == OnConflict.LOCAL_FIRST: @@ -356,12 +354,10 @@ class OutputDirectory: self._update_metadata(info) if changed: - self._conductor.print( - f"[bold bright_yellow]Changed[/] {escape(str(info.path))}") + log.action(f"[bold bright_yellow]Changed[/] {escape(str(info.path))}") self._report.change_file(info.path) else: - self._conductor.print( - f"[bold bright_green]Added[/] {escape(str(info.path))}") + log.action(f"[bold bright_green]Added[/] {escape(str(info.path))}") self._report.add_file(info.path) async def cleanup(self) -> None: @@ -390,8 +386,7 @@ class OutputDirectory: if await self._conflict_delete_lf(self._on_conflict, pure): try: path.unlink() - self._conductor.print( - f"[bold bright_magenta]Deleted[/] {escape(str(path))}") + log.action(f"[bold bright_magenta]Deleted[/] {escape(str(path))}") self._report.delete_file(pure) except OSError: pass diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 9154a80..10cd1c2 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -5,7 +5,6 @@ from rich.markup import escape from .authenticator import Authenticator from .authenticators import AUTHENTICATORS -from .conductor import TerminalConductor from .config import Config from .crawler import Crawler from .crawlers import CRAWLERS @@ -18,7 +17,6 @@ class PferdLoadException(Exception): class Pferd: def __init__(self, config: Config): self._config = config - self._conductor = TerminalConductor() self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} @@ -34,12 +32,7 @@ class Pferd: print(f"[red]Error: Unknown authenticator type {t}") continue - authenticator = authenticator_constructor( - name, - section, - self._config, - self._conductor, - ) + authenticator = authenticator_constructor(name, section, self._config) self._authenticators[name] = authenticator if abort: @@ -57,13 +50,7 @@ class Pferd: print(f"[red]Error: Unknown crawler type {t}") continue - crawler = 
crawler_constructor( - name, - section, - self._config, - self._conductor, - self._authenticators, - ) + crawler = crawler_constructor(name, section, self._config, self._authenticators) self._crawlers[name] = crawler if abort: From 38510655007c83b72ef9ff87e4d0640aae87e88c Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 18 May 2021 23:23:40 +0200 Subject: [PATCH 146/524] Fix local crawler's download bars Display the pure path instead of the local path. --- PFERD/crawlers/local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 363107f..d4156bc 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -91,7 +91,7 @@ class LocalCrawler(Crawler): if not dl: return - async with self.download_bar(path) as bar: + async with self.download_bar(pure) as bar: await asyncio.sleep(random.uniform( 0.5 * self._download_delay, self._download_delay, From b7a999bc2ea813325fa331c83a862b16aaef46a9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 19 May 2021 13:25:57 +0200 Subject: [PATCH 147/524] Clean up crawler exceptions and (a)noncritical --- PFERD/crawler.py | 136 +++++++++++++++++++++++++---------------------- 1 file changed, 71 insertions(+), 65 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 677baa2..96745d1 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -13,11 +13,15 @@ from .config import Config, Section from .limiter import Limiter from .logging import ProgressBar, log from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload -from .transformer import RuleParseException, Transformer +from .transformer import Transformer from .version import __version__ -class CrawlerLoadException(Exception): +class CrawlWarning(Exception): + pass + + +class CrawlError(Exception): pass @@ -26,41 +30,29 @@ Wrapped = TypeVar("Wrapped", bound=Callable[..., None]) def noncritical(f: Wrapped) -> Wrapped: """ - Warning: Must only be applied to member functions of the 
Crawler class! - Catches all exceptions occuring during the function call. If an exception occurs, the crawler's error_free variable is set to False. - """ - def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: - try: - f(self, *args, **kwargs) - except Exception as e: - log.print(f"[red]Something went wrong: {escape(str(e))}") - self.error_free = False - return wrapper # type: ignore - - -def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]: - """ Warning: Must only be applied to member functions of the Crawler class! - - If an exception occurs during the function call, retries the function call - a set amount of times. Exceptions that occur during the last attempt are - not caught and instead passed on upwards. """ - def decorator(f: Wrapped) -> Wrapped: - def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: - for _ in range(attempts - 1): - try: - f(self, *args, **kwargs) - return - except Exception: - pass - f(self, *args, **kwargs) - return wrapper # type: ignore - return decorator + def wrapper(*args: Any, **kwargs: Any) -> None: + if not (args and isinstance(args[0], Crawler)): + raise RuntimeError("@noncritical must only applied to Crawler methods") + + crawler = args[0] + + try: + f(*args, **kwargs) + except CrawlWarning as e: + log.print(f"[bold bright_red]Warning[/] {escape(str(e))}") + crawler.error_free = False + except CrawlError as e: + log.print(f"[bold bright_red]Error[/] [red]{escape(str(e))}") + crawler.error_free = False + raise + + return wrapper # type: ignore AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) @@ -69,42 +61,30 @@ AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) def anoncritical(f: AWrapped) -> AWrapped: """ An async version of @noncritical. - Warning: Must only be applied to member functions of the Crawler class! Catches all exceptions occuring during the function call. If an exception occurs, the crawler's error_free variable is set to False. 
- """ - async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: - try: - await f(self, *args, **kwargs) - except Exception as e: - log.print(f"[red]Something went wrong: {escape(str(e))}") - self.error_free = False - return wrapper # type: ignore - - -def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: - """ - An async version of @noncritical. Warning: Must only be applied to member functions of the Crawler class! - - If an exception occurs during the function call, retries the function call - a set amount of times. Exceptions that occur during the last attempt are - not caught and instead passed on upwards. """ - def decorator(f: AWrapped) -> AWrapped: - async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: - for _ in range(attempts - 1): - try: - await f(self, *args, **kwargs) - return - except Exception: - pass - await f(self, *args, **kwargs) - return wrapper # type: ignore - return decorator + async def wrapper(*args: Any, **kwargs: Any) -> None: + if not (args and isinstance(args[0], Crawler)): + raise RuntimeError("@anoncritical must only applied to Crawler methods") + + crawler = args[0] + + try: + await f(*args, **kwargs) + except CrawlWarning as e: + log.print(f"[bold bright_red]Warning[/] {escape(str(e))}") + crawler.error_free = False + except CrawlError as e: + log.print(f"[bold bright_red]Error[/] [red]{escape(str(e))}") + crawler.error_free = False + raise + + return wrapper # type: ignore class CrawlerSection(Section): @@ -201,11 +181,7 @@ class Crawler(ABC): task_delay=section.delay_between_tasks(), ) - try: - self._transformer = Transformer(section.transform()) - except RuleParseException as e: - e.pretty_print() - raise CrawlerLoadException() + self._transformer = Transformer(section.transform()) self._output_dir = OutputDirectory( config.working_dir / section.output_dir(name), @@ -312,3 +288,33 @@ class HttpCrawler(Crawler): cookie_jar.save(self._cookie_jar_path) except Exception: log.print(f"[bold 
red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") + + +def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]: + """Deprecated.""" + def decorator(f: Wrapped) -> Wrapped: + def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: + for _ in range(attempts - 1): + try: + f(self, *args, **kwargs) + return + except Exception: + pass + f(self, *args, **kwargs) + return wrapper # type: ignore + return decorator + + +def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: + """Deprecated.""" + def decorator(f: AWrapped) -> AWrapped: + async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: + for _ in range(attempts - 1): + try: + await f(self, *args, **kwargs) + return + except Exception: + pass + await f(self, *args, **kwargs) + return wrapper # type: ignore + return decorator From a7c025fd866132a7c5fd87684c2e56b951b1460e Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 19 May 2021 17:16:23 +0200 Subject: [PATCH 148/524] Implement reusable FileSinkToken for OutputDirectory --- PFERD/output_dir.py | 102 +++++++++++++++++++++++++++++--------------- PFERD/utils.py | 45 ++++++++++++++++++- 2 files changed, 112 insertions(+), 35 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 417fa52..783d6bc 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -3,19 +3,19 @@ import os import random import shutil import string -from contextlib import asynccontextmanager, contextmanager +from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime from enum import Enum from pathlib import Path, PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import AsyncContextManager, AsyncIterator, BinaryIO, Iterator, Optional +from typing import AsyncContextManager, BinaryIO, Iterator, Optional, Tuple from rich.markup import escape from .logging import log from .report import MarkConflictException, MarkDuplicateException, Report -from 
.utils import prompt_yes_no +from .utils import ReusableAsyncContextManager, prompt_yes_no SUFFIX_CHARS = string.ascii_lowercase + string.digits SUFFIX_LENGTH = 6 @@ -87,6 +87,49 @@ class DownloadInfo: success: bool = False +class FileSinkToken(ReusableAsyncContextManager[FileSink]): + # Whenever this class is entered, it creates a new temporary file and + # returns a corresponding FileSink. + # + # When it is exited again, the file is closed and information about the + # download handed back to the OutputDirectory. + + def __init__( + self, + output_dir: "OutputDirectory", + path: PurePath, + local_path: Path, + heuristics: Heuristics, + on_conflict: OnConflict, + ): + super().__init__() + + self._output_dir = output_dir + self._path = path + self._local_path = local_path + self._heuristics = heuristics + self._on_conflict = on_conflict + + async def _on_aenter(self) -> FileSink: + tmp_path, file = await self._output_dir._create_tmp_file(self._local_path) + sink = FileSink(file) + + async def after_download() -> None: + await self._output_dir._after_download(DownloadInfo( + self._path, + self._local_path, + tmp_path, + self._heuristics, + self._on_conflict, + sink.is_done(), + )) + + self._stack.push_async_callback(after_download) + self._stack.enter_context(file) + + return sink + + class OutputDirectory: def __init__( self, @@ -111,11 +154,9 @@ class OutputDirectory: try: self._report.mark(path) except MarkDuplicateException: - msg = "Another file has already been placed here." - raise OutputDirException(msg) + raise OutputDirException("Another file has already been placed here.") except MarkConflictException as e: - msg = f"Collides with other file: {e.collides_with}" - raise OutputDirException(msg) + raise OutputDirException(f"Collides with other file: {e.collides_with}") def resolve(self, path: PurePath) -> Path: """ @@ -123,8 +164,7 @@ class OutputDirectory: """ if ".." 
in path.parts: - msg = f"Path {path} contains forbidden '..'" - raise OutputDirException(msg) + raise OutputDirException(f"Path {path} contains forbidden '..'") return self._root / path def _should_download( @@ -137,6 +177,7 @@ class OutputDirectory: # since we know that the remote is different from the local files. This # includes the case where no local file exists. if not local_path.is_file(): + # TODO Don't download if on_conflict is LOCAL_FIRST or NO_DELETE return True if redownload == Redownload.NEVER: @@ -251,19 +292,24 @@ class OutputDirectory: name = f"{prefix}{base.name}.tmp.{suffix}" return base.parent / name - @asynccontextmanager - async def _sink_context_manager( + async def _create_tmp_file( self, - file: BinaryIO, - info: DownloadInfo, - ) -> AsyncIterator[FileSink]: - sink = FileSink(file) - try: - with file: - yield sink - finally: - info.success = sink.is_done() - await self._after_download(info) + local_path: Path, + ) -> Tuple[Path, BinaryIO]: + """ + May raise an OutputDirException. 
+ """ + + # Create tmp file + for attempt in range(TRIES): + suffix_length = SUFFIX_LENGTH + 2 * attempt + tmp_path = self._tmp_path(local_path, suffix_length) + try: + return tmp_path, open(tmp_path, "xb") + except FileExistsError: + pass # Try again + + raise OutputDirException(f"Failed to create temporary file {tmp_path}") async def download( self, @@ -306,19 +352,7 @@ class OutputDirectory: # Ensure parent directory exists local_path.parent.mkdir(parents=True, exist_ok=True) - # Create tmp file - for attempt in range(TRIES): - suffix_length = SUFFIX_LENGTH + 2 * attempt - tmp_path = self._tmp_path(local_path, suffix_length) - info = DownloadInfo(path, local_path, tmp_path, - heuristics, on_conflict) - try: - file = open(tmp_path, "xb") - return self._sink_context_manager(file, info) - except FileExistsError: - pass # Try again - - return None + return FileSinkToken(self, path, local_path, heuristics, on_conflict) def _update_metadata(self, info: DownloadInfo) -> None: if mtime := info.heuristics.mtime: diff --git a/PFERD/utils.py b/PFERD/utils.py index 3022ab6..0b3d40d 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -2,7 +2,11 @@ import asyncio import contextvars import functools import getpass -from typing import Any, Callable, Optional, TypeVar +import sys +from abc import ABC, abstractmethod +from contextlib import AsyncExitStack +from types import TracebackType +from typing import Any, Callable, Generic, Optional, Type, TypeVar import bs4 @@ -56,3 +60,42 @@ async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: return default print("Please answer with 'y' or 'n'.") + + +class ReusableAsyncContextManager(ABC, Generic[T]): + def __init__(self) -> None: + self._active = False + self._stack = AsyncExitStack() + + @abstractmethod + async def _on_aenter(self) -> T: + pass + + async def __aenter__(self) -> T: + if self._active: + raise RuntimeError("Nested or otherwise concurrent usage is not allowed") + + self._active = True + await 
self._stack.__aenter__() + + # See https://stackoverflow.com/a/13075071 + try: + result: T = await self._on_aenter() + except: # noqa: E722 do not use bare 'except' + if not await self.__aexit__(*sys.exc_info()): + raise + + return result + + async def __aexit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ) -> Optional[bool]: + if not self._active: + raise RuntimeError("__aexit__ called too many times") + + result = await self._stack.__aexit__(exc_type, exc_value, traceback) + self._active = False + return result From 5916626399e920cfa314c84a74f597bc6f305114 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 19 May 2021 17:16:59 +0200 Subject: [PATCH 149/524] Make noqua comment more specific --- PFERD/crawlers/ilias.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias.py index beac208..be3584c 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias.py @@ -831,7 +831,6 @@ async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> Beautifu return soupify(await response.read()) _link_template_plain = "{{link}}" -# flake8: noqa E501 _link_template_rich = """ @@ -917,4 +916,4 @@ _link_template_rich = """ -""" +""" # noqa: E501 line too long From 92886fb8d8104d3a56d370bb0a72a51062bda81a Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 19 May 2021 17:32:23 +0200 Subject: [PATCH 150/524] Implement --version flag --- PFERD/__main__.py | 10 ++++++++++ PFERD/crawler.py | 4 ++-- PFERD/version.py | 3 ++- setup.cfg | 2 +- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 5815f40..54228a5 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -6,8 +6,14 @@ from pathlib import Path from .config import Config, ConfigDumpException, ConfigLoadException from .output_dir import OnConflict, Redownload from .pferd import Pferd +from .version import NAME, VERSION 
GENERAL_PARSER = argparse.ArgumentParser(add_help=False) +GENERAL_PARSER.add_argument( + "--version", + action="store_true", + help="print version and exit" +) GENERAL_PARSER.add_argument( "--config", "-c", type=Path, @@ -211,6 +217,10 @@ def prune_crawlers( def main() -> None: args = PARSER.parse_args() + if args.version: + print(f"{NAME} {VERSION}") + exit() + try: config = Config(load_parser(args)) except ConfigLoadException: diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 96745d1..adfe74b 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -14,7 +14,7 @@ from .limiter import Limiter from .logging import ProgressBar, log from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload from .transformer import Transformer -from .version import __version__ +from .version import NAME, VERSION class CrawlWarning(Exception): @@ -275,7 +275,7 @@ class HttpCrawler(Crawler): pass async with aiohttp.ClientSession( - headers={"User-Agent": f"pferd/{__version__}"}, + headers={"User-Agent": f"{NAME}/{VERSION}"}, cookie_jar=cookie_jar, ) as session: self.session = session diff --git a/PFERD/version.py b/PFERD/version.py index 528787c..e26dabb 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1 +1,2 @@ -__version__ = "3.0.0" +NAME = "PFERD" +VERSION = "3.0.0" diff --git a/setup.cfg b/setup.cfg index f6b64ea..cb85ab0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = PFERD -version = attr: PFERD.version.__version__ +version = attr: PFERD.version.VERSION [options] packages = PFERD From 0d10752b5a9f68d2f0bd97ac5003bf2690027d58 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 19 May 2021 17:48:51 +0200 Subject: [PATCH 151/524] Configure explain log level via cli and config file --- CONFIG.md | 2 ++ PFERD/__main__.py | 19 +++++++++++++++++++ PFERD/config.py | 23 ++++++++++++++--------- PFERD/crawler.py | 2 +- PFERD/crawlers/local.py | 2 +- PFERD/logging.py | 19 +++++++------------ 6 files changed, 44 insertions(+), 23 deletions(-) 
diff --git a/CONFIG.md b/CONFIG.md index 6149ef5..29fc7e2 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -19,6 +19,8 @@ default values for the other sections. paths in the config file are interpreted relative to this path. If this path is relative, it is interpreted relative to the script's working dir. `~` is expanded to the current user's home directory. (Default: `.`) +- `explain`: Whether PFERD should log and explain its actions and decisions in + detail. (Default: `no`) ## The `crawl:*` sections diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 54228a5..589c12d 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -4,6 +4,7 @@ import configparser from pathlib import Path from .config import Config, ConfigDumpException, ConfigLoadException +from .logging import log from .output_dir import OnConflict, Redownload from .pferd import Pferd from .version import NAME, VERSION @@ -42,6 +43,13 @@ GENERAL_PARSER.add_argument( metavar="PATH", help="custom working directory" ) +GENERAL_PARSER.add_argument( + "--explain", "-e", + # TODO Use argparse.BooleanOptionalAction after updating to 3.9 + action="store_const", + const=True, + help="log and explain in detail what PFERD is doing" +) def load_general( @@ -52,6 +60,8 @@ def load_general( if args.working_dir is not None: section["working_dir"] = str(args.working_dir) + if args.explain is not None: + section["explain"] = "true" if args.explain else "false" CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) @@ -217,6 +227,10 @@ def prune_crawlers( def main() -> None: args = PARSER.parse_args() + # Configure log levels set by command line arguments + if args.explain is not None: + log.output_explain = args.explain + if args.version: print(f"{NAME} {VERSION}") exit() @@ -226,6 +240,11 @@ def main() -> None: except ConfigLoadException: exit(1) + # Configure log levels set in the config file + # TODO Catch config section exceptions + if args.explain is None: + log.output_explain = config.default_section.explain() + 
if args.dump_config is not None: try: if args.dump_config is True: diff --git a/PFERD/config.py b/PFERD/config.py index 7fe5d9e..08beb0c 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -50,6 +50,15 @@ class Section: self.error(key, "Missing value") +class DefaultSection(Section): + def working_dir(self) -> Path: + pathstr = self.s.get("working_dir", ".") + return Path(pathstr).expanduser() + + def explain(self) -> bool: + return self.s.getboolean("explain", fallback=False) + + class Config: @staticmethod def _default_path() -> Path: @@ -62,6 +71,11 @@ class Config: def __init__(self, parser: ConfigParser): self._parser = parser + self._default_section = DefaultSection(parser[parser.default_section]) + + @property + def default_section(self) -> DefaultSection: + return self._default_section @staticmethod def _fail_load(path: Path, reason: str) -> None: @@ -134,10 +148,6 @@ class Config: def dump_to_stdout(self) -> None: self._parser.write(sys.stdout) - @property - def default_section(self) -> SectionProxy: - return self._parser[self._parser.default_section] - def crawler_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] for name, proxy in self._parser.items(): @@ -153,8 +163,3 @@ class Config: result.append((name, proxy)) return result - - @property - def working_dir(self) -> Path: - pathstr = self.default_section.get("working_dir", ".") - return Path(pathstr).expanduser() diff --git a/PFERD/crawler.py b/PFERD/crawler.py index adfe74b..80ecedb 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -184,7 +184,7 @@ class Crawler(ABC): self._transformer = Transformer(section.transform()) self._output_dir = OutputDirectory( - config.working_dir / section.output_dir(name), + config.default_section.working_dir() / section.output_dir(name), section.redownload(), section.on_conflict(), ) diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index d4156bc..8cfc79a 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -46,7 +46,7 
@@ class LocalCrawler(Crawler): ): super().__init__(name, section, config) - self._target = config.working_dir / section.target() + self._target = config.default_section.working_dir() / section.target() self._crawl_delay = section.crawl_delay() self._download_delay = section.download_delay() self._download_speed = section.download_speed() diff --git a/PFERD/logging.py b/PFERD/logging.py index b075d35..cedc5c9 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -52,9 +52,9 @@ class Log: self._lines: List[str] = [] # Whether different parts of the output are enabled or disabled - self._enabled_explain = False - self._enabled_action = True - self._enabled_report = True + self.output_explain = False + self.output_action = True + self.output_report = True def _update_live(self) -> None: elements = [] @@ -66,11 +66,6 @@ class Log: group = RenderGroup(*elements) # type: ignore self._live.update(group) - def configure(self, explain: bool, action: bool, report: bool) -> None: - self._enabled_explain = explain - self._enabled_action = action - self._enabled_report = report - @contextmanager def show_progress(self) -> Iterator[None]: if self._showing_progress: @@ -107,19 +102,19 @@ class Log: self.console.print(text) def explain_topic(self, text: str) -> None: - if self._enabled_explain: + if self.output_explain: self.print(f"[cyan]{escape(text)}") def explain(self, text: str) -> None: - if self._enabled_explain: + if self.output_explain: self.print(f" {escape(text)}") def action(self, text: str) -> None: - if self._enabled_action: + if self.output_action: self.print(text) def report(self, text: str) -> None: - if self._enabled_report: + if self.output_report: self.print(text) @contextmanager From 3300886120a0a21127c69f7eaf5af0cb246cae24 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 19 May 2021 18:10:17 +0200 Subject: [PATCH 152/524] Explain config file loading --- PFERD/__main__.py | 16 ++++++++++++---- PFERD/config.py | 23 ++++++++++++----------- PFERD/logging.py | 9 
+++++++++ 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 589c12d..c03e08c 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -133,7 +133,7 @@ SUBPARSERS = PARSER.add_subparsers(title="crawlers") LOCAL_CRAWLER = SUBPARSERS.add_parser( "local", - parents=[GENERAL_PARSER, CRAWLER_PARSER], + parents=[CRAWLER_PARSER], ) LOCAL_CRAWLER.set_defaults(command="local") LOCAL_CRAWLER_GROUP = LOCAL_CRAWLER.add_argument_group( @@ -194,12 +194,16 @@ def load_local_crawler( def load_parser( args: argparse.Namespace, ) -> configparser.ConfigParser: + log.explain_topic("Loading config") parser = configparser.ConfigParser() if args.command is None: + log.explain("No CLI command specified, loading config from file") Config.load_parser(parser, path=args.config) - elif args.command == "local": - load_local_crawler(args, parser) + else: + log.explain(f"CLI command specified, creating config for {args.command!r}") + if args.command == "local": + load_local_crawler(args, parser) load_general(args, parser) prune_crawlers(args, parser) @@ -230,6 +234,8 @@ def main() -> None: # Configure log levels set by command line arguments if args.explain is not None: log.output_explain = args.explain + if args.dump_config: + log.output_explain = False if args.version: print(f"{NAME} {VERSION}") @@ -237,7 +243,9 @@ def main() -> None: try: config = Config(load_parser(args)) - except ConfigLoadException: + except ConfigLoadException as e: + log.error(f"Failed to load config file at path {str(e.path)!r}") + log.error_contd(f"Reason: {e.reason}") exit(1) # Configure log levels set in the config file diff --git a/PFERD/config.py b/PFERD/config.py index 08beb0c..30ae3fb 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -6,11 +6,14 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, List, NoReturn, Optional, Tuple +from .logging import log from .utils import prompt_yes_no +@dataclass class 
ConfigLoadException(Exception): - pass + path: Path + reason: str class ConfigDumpException(Exception): @@ -77,20 +80,18 @@ class Config: def default_section(self) -> DefaultSection: return self._default_section - @staticmethod - def _fail_load(path: Path, reason: str) -> None: - print(f"Failed to load config file at {path}") - print(f"Reason: {reason}") - raise ConfigLoadException() - @staticmethod def load_parser(parser: ConfigParser, path: Optional[Path] = None) -> None: """ May throw a ConfigLoadException. """ - if not path: + if path: + log.explain("Using custom path") + else: + log.explain("Using default path") path = Config._default_path() + log.explain(f"Loading {str(path)!r}") # Using config.read_file instead of config.read because config.read # would just ignore a missing file and carry on. @@ -98,11 +99,11 @@ class Config: with open(path) as f: parser.read_file(f, source=str(path)) except FileNotFoundError: - Config._fail_load(path, "File does not exist") + raise ConfigLoadException(path, "File does not exist") except IsADirectoryError: - Config._fail_load(path, "That's a directory, not a file") + raise ConfigLoadException(path, "That's a directory, not a file") except PermissionError: - Config._fail_load(path, "Insufficient permissions") + raise ConfigLoadException(path, "Insufficient permissions") @staticmethod def _fail_dump(path: Path, reason: str) -> None: diff --git a/PFERD/logging.py b/PFERD/logging.py index cedc5c9..e2a6d33 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -101,6 +101,15 @@ class Log: else: self.console.print(text) + def warn(self, text: str) -> None: + self.print(f"[bold bright_red]Warning[/] {escape(text)}") + + def error(self, text: str) -> None: + self.print(f"[bold bright_red]Error[/] [red]{escape(text)}") + + def error_contd(self, text: str) -> None: + self.print(f"[red]{escape(text)}") + def explain_topic(self, text: str) -> None: if self.output_explain: self.print(f"[cyan]{escape(text)}") From 
9f03702e69a9f09a8d7df6ad49378d3f15ae7bf4 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 19 May 2021 21:34:36 +0200 Subject: [PATCH 153/524] Split up ilias crawler in multiple files The ilias crawler contained a crawler and an HTML parser, now they are split in two. --- PFERD/crawlers/ilias/__init__.py | 3 + PFERD/crawlers/ilias/kit_ilias_html.py | 452 +++++++++++++++++ .../kit_web_ilias_crawler.py} | 477 +----------------- PFERD/utils.py | 27 +- 4 files changed, 488 insertions(+), 471 deletions(-) create mode 100644 PFERD/crawlers/ilias/__init__.py create mode 100644 PFERD/crawlers/ilias/kit_ilias_html.py rename PFERD/crawlers/{ilias.py => ilias/kit_web_ilias_crawler.py} (51%) diff --git a/PFERD/crawlers/ilias/__init__.py b/PFERD/crawlers/ilias/__init__.py new file mode 100644 index 0000000..15b8d5d --- /dev/null +++ b/PFERD/crawlers/ilias/__init__.py @@ -0,0 +1,3 @@ +from .kit_web_ilias_crawler import KitIliasCrawler, KitIliasCrawlerSection + +__all__ = ["KitIliasCrawler", "KitIliasCrawlerSection"] diff --git a/PFERD/crawlers/ilias/kit_ilias_html.py b/PFERD/crawlers/ilias/kit_ilias_html.py new file mode 100644 index 0000000..17eb855 --- /dev/null +++ b/PFERD/crawlers/ilias/kit_ilias_html.py @@ -0,0 +1,452 @@ +import json +import re +from dataclasses import dataclass +from datetime import date, datetime, timedelta +from enum import Enum +# TODO In Python 3.9 and above, AsyncContextManager is deprecated +from typing import List, Optional, Union +from urllib.parse import urljoin, urlparse + +from bs4 import BeautifulSoup, Tag + +from PFERD.utils import url_set_query_params + +TargetType = Union[str, int] + + +class IliasElementType(Enum): + EXERCISE = "exercise" + FILE = "file" + FOLDER = "folder" + FORUM = "forum" + LINK = "link" + MEETING = "meeting" + VIDEO = "video" + VIDEO_PLAYER = "video_player" + VIDEO_FOLDER = "video_folder" + VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" + + +@dataclass +class IliasPageElement: + type: 
IliasElementType + url: str + name: str + mtime: Optional[datetime] = None + description: Optional[str] = None + + +class IliasPage: + + def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): + self._soup = soup + self._page_url = _page_url + self._page_type = source_element.type if source_element else None + self._source_name = source_element.name if source_element else "" + + def get_child_elements(self) -> List[IliasPageElement]: + """ + Return all child page elements you can find here. + """ + if self._is_video_player(): + return self._player_to_video() + if self._is_video_listing(): + return self._find_video_entries() + if self._is_exercise_file(): + return self._find_exercise_entries() + return self._find_normal_entries() + + def _is_video_player(self) -> bool: + return "paella_config_file" in str(self._soup) + + def _is_video_listing(self) -> bool: + # ILIAS fluff around it + if self._soup.find(id="headerimage"): + element: Tag = self._soup.find(id="headerimage") + if "opencast" in element.attrs["src"].lower(): + return True + + # Raw listing without ILIAS fluff + video_element_table: Tag = self._soup.find( + name="table", id=re.compile(r"tbl_xoct_.+") + ) + return video_element_table is not None + + def _is_exercise_file(self) -> bool: + # we know it from before + if self._page_type == IliasElementType.EXERCISE: + return True + + # We have no suitable parent - let's guesss + if self._soup.find(id="headerimage"): + element: Tag = self._soup.find(id="headerimage") + if "exc" in element.attrs["src"].lower(): + return True + + return False + + def _player_to_video(self) -> List[IliasPageElement]: + # Fetch the actual video page. This is a small wrapper page initializing a javscript + # player. Sadly we can not execute that JS. The actual video stream url is nowhere + # on the page, but defined in a JS object inside a script tag, passed to the player + # library. 
+ # We do the impossible and RegEx the stream JSON object out of the page's HTML source + regex: re.Pattern[str] = re.compile( + r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE + ) + json_match = regex.search(str(self._soup)) + + if json_match is None: + print(f"Could not find json stream info for {self._page_url!r}") + return [] + json_str = json_match.group(1) + + # parse it + json_object = json.loads(json_str) + # and fetch the video url! + video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] + return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + + def _find_video_entries(self) -> List[IliasPageElement]: + # ILIAS has three stages for video pages + # 1. The initial dummy page without any videos. This page contains the link to the listing + # 2. The video listing which might be paginated + # 3. An unpaginated video listing (or at least one that includes 800 videos) + # + # We need to figure out where we are. + + video_element_table: Tag = self._soup.find( + name="table", id=re.compile(r"tbl_xoct_.+") + ) + + if video_element_table is None: + # We are in stage 1 + # The page is actually emtpy but contains the link to stage 2 + content_link: Tag = self._soup.select_one("#tab_series a") + url: str = self._abs_url_from_link(content_link) + query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} + url = url_set_query_params(url, query_params) + return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] + + is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None + + if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: + # We are in stage 2 - try to break pagination + return self._find_video_entries_paginated() + + return self._find_video_entries_no_paging() + + def _find_video_entries_paginated(self) -> List[IliasPageElement]: + table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) + + if 
table_element is None: + # TODO: Properly log this + print( + "Could not increase elements per page (table not found)." + " Some might not be crawled!" + ) + return self._find_video_entries_no_paging() + + id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) + if id_match is None: + # TODO: Properly log this + print( + "Could not increase elements per page (table id not found)." + " Some might not be crawled!" + ) + return self._find_video_entries_no_paging() + + table_id = id_match.group(1) + + query_params = {f"tbl_xoct_{table_id}_trows": "800", + "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} + url = url_set_query_params(self._page_url, query_params) + return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] + + def _find_video_entries_no_paging(self) -> List[IliasPageElement]: + """ + Crawls the "second stage" video page. This page contains the actual video urls. + """ + # Video start links are marked with an "Abspielen" link + video_links: List[Tag] = self._soup.findAll( + name="a", text=re.compile(r"\s*Abspielen\s*") + ) + + results: List[IliasPageElement] = [] + + # TODO: Sadly the download button is currently broken, so never do that + for link in video_links: + results.append(self._listed_video_to_element(link)) + + return results + + def _listed_video_to_element(self, link: Tag) -> IliasPageElement: + # The link is part of a table with multiple columns, describing metadata. 
+ # 6th child (1 indexed) is the modification time string + modification_string = link.parent.parent.parent.select_one( + "td.std:nth-child(6)" + ).getText().strip() + modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") + + title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() + title += ".mp4" + + video_name: str = _sanitize_path_name(title) + + video_url = self._abs_url_from_link(link) + + return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) + + def _find_exercise_entries(self) -> List[IliasPageElement]: + results: List[IliasPageElement] = [] + + # Each assignment is in an accordion container + assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer") + + for container in assignment_containers: + # Fetch the container name out of the header to use it in the path + container_name = container.select_one(".ilAssignmentHeader").getText().strip() + # Find all download links in the container (this will contain all the files) + files: List[Tag] = container.findAll( + name="a", + # download links contain the given command class + attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, + text="Download" + ) + + # Grab each file as you now have the link + for file_link in files: + # Two divs, side by side. 
Left is the name, right is the link ==> get left + # sibling + file_name = file_link.parent.findPrevious(name="div").getText().strip() + file_name = _sanitize_path_name(file_name) + url = self._abs_url_from_link(file_link) + + results.append(IliasPageElement( + IliasElementType.FILE, + url, + container_name + "/" + file_name, + None # We do not have any timestamp + )) + + return results + + def _find_normal_entries(self) -> List[IliasPageElement]: + result: List[IliasPageElement] = [] + + # Fetch all links and throw them to the general interpreter + links: List[Tag] = self._soup.select("a.il_ContainerItemTitle") + + for link in links: + abs_url = self._abs_url_from_link(link) + element_name = _sanitize_path_name(link.getText()) + element_type = self._find_type_from_link(element_name, link, abs_url) + description = self._find_link_description(link) + + if not element_type: + continue + if element_type == IliasElementType.MEETING: + element_name = _sanitize_path_name(self._normalize_meeting_name(element_name)) + elif element_type == IliasElementType.FILE: + result.append(self._file_to_element(element_name, abs_url, link)) + continue + + result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) + + return result + + def _find_link_description(self, link: Tag) -> Optional[str]: + tile: Tag = link.findParent("div", {"class": lambda x: x and "il_ContainerListItem" in x}) + if not tile: + return None + description_element: Tag = tile.find("div", {"class": lambda x: x and "il_Description" in x}) + if not description_element: + return None + return description_element.getText().strip() + + def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement: + # Files have a list of properties (type, modification date, size, etc.) + # In a series of divs. 
+ # Find the parent containing all those divs, so we can filter our what we need + properties_parent: Tag = link_element.findParent( + "div", {"class": lambda x: "il_ContainerListItem" in x} + ).select_one(".il_ItemProperties") + # The first one is always the filetype + file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() + + # The rest does not have a stable order. Grab the whole text and reg-ex the date + # out of it + all_properties_text = properties_parent.getText().strip() + modification_date_match = re.search( + r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", + all_properties_text + ) + if modification_date_match is None: + modification_date = None + # TODO: Properly log this + print(f"Could not extract start date from {all_properties_text!r}") + else: + modification_date_str = modification_date_match.group(1) + modification_date = demangle_date(modification_date_str) + + # Grab the name from the link text + name = _sanitize_path_name(link_element.getText()) + full_path = name + "." + file_type + + return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) + + @staticmethod + def _find_type_from_link( + element_name: str, + link_element: Tag, + url: str + ) -> Optional[IliasElementType]: + """ + Decides which sub crawler to use for a given top level element. 
+ """ + parsed_url = urlparse(url) + + # file URLs contain "target=file" + if "target=file_" in parsed_url.query: + return IliasElementType.FILE + + # Skip forums + if "cmd=showThreads" in parsed_url.query: + return IliasElementType.FORUM + + # Everything with a ref_id can *probably* be opened to reveal nested things + # video groups, directories, exercises, etc + if "ref_id=" in parsed_url.query: + return IliasPage._find_type_from_folder_like(link_element, url) + + # TODO: Log this properly + print(f"Unknown type: The element was at {str(element_name)!r} and it is {link_element!r})") + return None + + @staticmethod + def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]: + """ + Try crawling something that looks like a folder. + """ + # pylint: disable=too-many-return-statements + + found_parent: Optional[Tag] = None + + # We look for the outer div of our inner link, to find information around it + # (mostly the icon) + for parent in link_element.parents: + if "ilContainerListItemOuter" in parent["class"]: + found_parent = parent + break + + if found_parent is None: + # TODO: Log this properly + print(f"Could not find element icon for {url!r}") + return None + + # Find the small descriptive icon to figure out the type + img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") + + if img_tag is None: + # TODO: Log this properly + print(f"Could not find image tag for {url!r}") + return None + + if "opencast" in str(img_tag["alt"]).lower(): + return IliasElementType.VIDEO_FOLDER + + if str(img_tag["src"]).endswith("icon_exc.svg"): + return IliasElementType.EXERCISE + + if str(img_tag["src"]).endswith("icon_webr.svg"): + return IliasElementType.LINK + + if str(img_tag["src"]).endswith("frm.svg"): + return IliasElementType.FORUM + + if str(img_tag["src"]).endswith("sess.svg"): + return IliasElementType.MEETING + + return IliasElementType.FOLDER + + @staticmethod + def _normalize_meeting_name(meeting_name: str) -> str: 
+ """ + Normalizes meeting names, which have a relative time as their first part, + to their date in ISO format. + """ + date_portion_str = meeting_name.split(" - ")[0] + date_portion = demangle_date(date_portion_str) + + if not date_portion: + return meeting_name + + rest_of_name = meeting_name + if rest_of_name.startswith(date_portion_str): + rest_of_name = rest_of_name[len(date_portion_str):] + + return datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name + + def _abs_url_from_link(self, link_tag: Tag) -> str: + """ + Create an absolute url from an tag. + """ + return urljoin(self._page_url, link_tag.get("href")) + + +german_months = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez'] +english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + + +def demangle_date(date_str: str) -> Optional[datetime]: + """ + Demangle a given date in one of the following formats: + "Gestern, HH:MM" + "Heute, HH:MM" + "Morgen, HH:MM" + "dd. mon yyyy, HH:MM + """ + try: + date_str = re.sub(r"\s+", " ", date_str) + date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) + date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) + date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) + for german, english in zip(german_months, english_months): + date_str = date_str.replace(german, english) + # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" + date_str = date_str.replace(english + ".", english) + + # We now have a nice english String in the format: "dd. 
mmm yyyy, hh:mm" + day_part, time_part = date_str.split(",") + day_str, month_str, year_str = day_part.split(" ") + + day = int(day_str.strip().replace(".", "")) + month = english_months.index(month_str.strip()) + 1 + year = int(year_str.strip()) + + hour_str, minute_str = time_part.split(":") + hour = int(hour_str) + minute = int(minute_str) + + return datetime(year, month, day, hour, minute) + except Exception: + # TODO: Properly log this + print(f"Could not parse date {date_str!r}") + return None + + +def _format_date_english(date_to_format: date) -> str: + month = english_months[date_to_format.month - 1] + return f"{date_to_format.day:02d}. {month} {date_to_format.year:04d}" + + +def _yesterday() -> date: + return date.today() - timedelta(days=1) + + +def _tomorrow() -> date: + return date.today() + timedelta(days=1) + + +def _sanitize_path_name(name: str) -> str: + return name.replace("/", "-").replace("\\", "-").strip() diff --git a/PFERD/crawlers/ilias.py b/PFERD/crawlers/ilias/kit_web_ilias_crawler.py similarity index 51% rename from PFERD/crawlers/ilias.py rename to PFERD/crawlers/ilias/kit_web_ilias_crawler.py index be3584c..be613e6 100644 --- a/PFERD/crawlers/ilias.py +++ b/PFERD/crawlers/ilias/kit_web_ilias_crawler.py @@ -1,23 +1,19 @@ import asyncio -import json import re -from dataclasses import dataclass -from datetime import date, datetime, timedelta -from enum import Enum from pathlib import PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import Any, Dict, List, Optional, Set, Union -from urllib.parse import parse_qs, urlencode, urljoin, urlparse, urlsplit, urlunsplit +from typing import Any, Dict, Optional, Set, Union import aiohttp from bs4 import BeautifulSoup, Tag +from PFERD.authenticators import Authenticator +from PFERD.config import Config +from PFERD.crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat from PFERD.output_dir import Redownload -from PFERD.utils import soupify +from 
PFERD.utils import soupify, url_set_query_param -from ..authenticators import Authenticator -from ..config import Config -from ..crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat +from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement TargetType = Union[str, int] @@ -58,465 +54,6 @@ class KitIliasCrawlerSection(CrawlerSection): return self.s.getboolean("link_file_plain_text", fallback=False) -class IliasElementType(Enum): - EXERCISE = "exercise" - FILE = "file" - FOLDER = "folder" - FORUM = "forum" - LINK = "link" - MEETING = "meeting" - VIDEO = "video" - VIDEO_PLAYER = "video_player" - VIDEO_FOLDER = "video_folder" - VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" - - -@dataclass -class IliasPageElement: - type: IliasElementType - url: str - name: str - mtime: Optional[datetime] = None - description: Optional[str] = None - - -class IliasPage: - - def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): - self._soup = soup - self._page_url = _page_url - self._page_type = source_element.type if source_element else None - self._source_name = source_element.name if source_element else "" - - def get_child_elements(self) -> List[IliasPageElement]: - """ - Return all child page elements you can find here. 
- """ - if self._is_video_player(): - return self._player_to_video() - if self._is_video_listing(): - return self._find_video_entries() - if self._is_exercise_file(): - return self._find_exercise_entries() - return self._find_normal_entries() - - def _is_video_player(self) -> bool: - return "paella_config_file" in str(self._soup) - - def _is_video_listing(self) -> bool: - # ILIAS fluff around it - if self._soup.find(id="headerimage"): - element: Tag = self._soup.find(id="headerimage") - if "opencast" in element.attrs["src"].lower(): - return True - - # Raw listing without ILIAS fluff - video_element_table: Tag = self._soup.find( - name="table", id=re.compile(r"tbl_xoct_.+") - ) - return video_element_table is not None - - def _is_exercise_file(self) -> bool: - # we know it from before - if self._page_type == IliasElementType.EXERCISE: - return True - - # We have no suitable parent - let's guesss - if self._soup.find(id="headerimage"): - element: Tag = self._soup.find(id="headerimage") - if "exc" in element.attrs["src"].lower(): - return True - - return False - - def _player_to_video(self) -> List[IliasPageElement]: - # Fetch the actual video page. This is a small wrapper page initializing a javscript - # player. Sadly we can not execute that JS. The actual video stream url is nowhere - # on the page, but defined in a JS object inside a script tag, passed to the player - # library. - # We do the impossible and RegEx the stream JSON object out of the page's HTML source - regex: re.Pattern[str] = re.compile( - r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE - ) - json_match = regex.search(str(self._soup)) - - if json_match is None: - print(f"Could not find json stream info for {self._page_url!r}") - return [] - json_str = json_match.group(1) - - # parse it - json_object = json.loads(json_str) - # and fetch the video url! 
- video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] - return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] - - def _find_video_entries(self) -> List[IliasPageElement]: - # ILIAS has three stages for video pages - # 1. The initial dummy page without any videos. This page contains the link to the listing - # 2. The video listing which might be paginated - # 3. An unpaginated video listing (or at least one that includes 800 videos) - # - # We need to figure out where we are. - - video_element_table: Tag = self._soup.find( - name="table", id=re.compile(r"tbl_xoct_.+") - ) - - if video_element_table is None: - # We are in stage 1 - # The page is actually emtpy but contains the link to stage 2 - content_link: Tag = self._soup.select_one("#tab_series a") - url: str = self._abs_url_from_link(content_link) - query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} - url = _url_set_query_params(url, query_params) - return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] - - is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None - - if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: - # We are in stage 2 - try to break pagination - return self._find_video_entries_paginated() - - return self._find_video_entries_no_paging() - - def _find_video_entries_paginated(self) -> List[IliasPageElement]: - table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) - - if table_element is None: - # TODO: Properly log this - print( - "Could not increase elements per page (table not found)." - " Some might not be crawled!" - ) - return self._find_video_entries_no_paging() - - id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) - if id_match is None: - # TODO: Properly log this - print( - "Could not increase elements per page (table id not found)." - " Some might not be crawled!" 
- ) - return self._find_video_entries_no_paging() - - table_id = id_match.group(1) - - query_params = {f"tbl_xoct_{table_id}_trows": "800", - "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} - url = _url_set_query_params(self._page_url, query_params) - return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] - - def _find_video_entries_no_paging(self) -> List[IliasPageElement]: - """ - Crawls the "second stage" video page. This page contains the actual video urls. - """ - # Video start links are marked with an "Abspielen" link - video_links: List[Tag] = self._soup.findAll( - name="a", text=re.compile(r"\s*Abspielen\s*") - ) - - results: List[IliasPageElement] = [] - - # TODO: Sadly the download button is currently broken, so never do that - for link in video_links: - results.append(self._listed_video_to_element(link)) - - return results - - def _listed_video_to_element(self, link: Tag) -> IliasPageElement: - # The link is part of a table with multiple columns, describing metadata. 
- # 6th child (1 indexed) is the modification time string - modification_string = link.parent.parent.parent.select_one( - "td.std:nth-child(6)" - ).getText().strip() - modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") - - title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() - title += ".mp4" - - video_name: str = _sanitize_path_name(title) - - video_url = self._abs_url_from_link(link) - - return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) - - def _find_exercise_entries(self) -> List[IliasPageElement]: - results: List[IliasPageElement] = [] - - # Each assignment is in an accordion container - assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer") - - for container in assignment_containers: - # Fetch the container name out of the header to use it in the path - container_name = container.select_one(".ilAssignmentHeader").getText().strip() - # Find all download links in the container (this will contain all the files) - files: List[Tag] = container.findAll( - name="a", - # download links contain the given command class - attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, - text="Download" - ) - - # Grab each file as you now have the link - for file_link in files: - # Two divs, side by side. 
Left is the name, right is the link ==> get left - # sibling - file_name = file_link.parent.findPrevious(name="div").getText().strip() - file_name = _sanitize_path_name(file_name) - url = self._abs_url_from_link(file_link) - - results.append(IliasPageElement( - IliasElementType.FILE, - url, - container_name + "/" + file_name, - None # We do not have any timestamp - )) - - return results - - def _find_normal_entries(self) -> List[IliasPageElement]: - result: List[IliasPageElement] = [] - - # Fetch all links and throw them to the general interpreter - links: List[Tag] = self._soup.select("a.il_ContainerItemTitle") - - for link in links: - abs_url = self._abs_url_from_link(link) - element_name = _sanitize_path_name(link.getText()) - element_type = self._find_type_from_link(element_name, link, abs_url) - description = self._find_link_description(link) - - if not element_type: - continue - if element_type == IliasElementType.MEETING: - element_name = _sanitize_path_name(self._normalize_meeting_name(element_name)) - elif element_type == IliasElementType.FILE: - result.append(self._file_to_element(element_name, abs_url, link)) - continue - - result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) - - return result - - def _find_link_description(self, link: Tag) -> Optional[str]: - tile: Tag = link.findParent("div", {"class": lambda x: x and "il_ContainerListItem" in x}) - if not tile: - return None - description_element: Tag = tile.find("div", {"class": lambda x: x and "il_Description" in x}) - if not description_element: - return None - return description_element.getText().strip() - - def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement: - # Files have a list of properties (type, modification date, size, etc.) - # In a series of divs. 
- # Find the parent containing all those divs, so we can filter our what we need - properties_parent: Tag = link_element.findParent( - "div", {"class": lambda x: "il_ContainerListItem" in x} - ).select_one(".il_ItemProperties") - # The first one is always the filetype - file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip() - - # The rest does not have a stable order. Grab the whole text and reg-ex the date - # out of it - all_properties_text = properties_parent.getText().strip() - modification_date_match = re.search( - r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)", - all_properties_text - ) - if modification_date_match is None: - modification_date = None - # TODO: Properly log this - print(f"Could not extract start date from {all_properties_text!r}") - else: - modification_date_str = modification_date_match.group(1) - modification_date = demangle_date(modification_date_str) - - # Grab the name from the link text - name = _sanitize_path_name(link_element.getText()) - full_path = name + "." + file_type - - return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) - - @staticmethod - def _find_type_from_link( - element_name: str, - link_element: Tag, - url: str - ) -> Optional[IliasElementType]: - """ - Decides which sub crawler to use for a given top level element. 
- """ - parsed_url = urlparse(url) - - # file URLs contain "target=file" - if "target=file_" in parsed_url.query: - return IliasElementType.FILE - - # Skip forums - if "cmd=showThreads" in parsed_url.query: - return IliasElementType.FORUM - - # Everything with a ref_id can *probably* be opened to reveal nested things - # video groups, directories, exercises, etc - if "ref_id=" in parsed_url.query: - return IliasPage._find_type_from_folder_like(link_element, url) - - # TODO: Log this properly - print(f"Unknown type: The element was at {str(element_name)!r} and it is {link_element!r})") - return None - - @staticmethod - def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]: - """ - Try crawling something that looks like a folder. - """ - # pylint: disable=too-many-return-statements - - found_parent: Optional[Tag] = None - - # We look for the outer div of our inner link, to find information around it - # (mostly the icon) - for parent in link_element.parents: - if "ilContainerListItemOuter" in parent["class"]: - found_parent = parent - break - - if found_parent is None: - # TODO: Log this properly - print(f"Could not find element icon for {url!r}") - return None - - # Find the small descriptive icon to figure out the type - img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") - - if img_tag is None: - # TODO: Log this properly - print(f"Could not find image tag for {url!r}") - return None - - if "opencast" in str(img_tag["alt"]).lower(): - return IliasElementType.VIDEO_FOLDER - - if str(img_tag["src"]).endswith("icon_exc.svg"): - return IliasElementType.EXERCISE - - if str(img_tag["src"]).endswith("icon_webr.svg"): - return IliasElementType.LINK - - if str(img_tag["src"]).endswith("frm.svg"): - return IliasElementType.FORUM - - if str(img_tag["src"]).endswith("sess.svg"): - return IliasElementType.MEETING - - return IliasElementType.FOLDER - - @staticmethod - def _normalize_meeting_name(meeting_name: str) -> str: 
- """ - Normalizes meeting names, which have a relative time as their first part, - to their date in ISO format. - """ - date_portion_str = meeting_name.split(" - ")[0] - date_portion = demangle_date(date_portion_str) - - if not date_portion: - return meeting_name - - rest_of_name = meeting_name - if rest_of_name.startswith(date_portion_str): - rest_of_name = rest_of_name[len(date_portion_str):] - - return datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name - - def _abs_url_from_link(self, link_tag: Tag) -> str: - """ - Create an absolute url from an tag. - """ - return urljoin(self._page_url, link_tag.get("href")) - - -german_months = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez'] -english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - - -def demangle_date(date_str: str) -> Optional[datetime]: - """ - Demangle a given date in one of the following formats: - "Gestern, HH:MM" - "Heute, HH:MM" - "Morgen, HH:MM" - "dd. mon yyyy, HH:MM - """ - try: - date_str = re.sub(r"\s+", " ", date_str) - date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) - date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) - date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) - for german, english in zip(german_months, english_months): - date_str = date_str.replace(german, english) - # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" - date_str = date_str.replace(english + ".", english) - - # We now have a nice english String in the format: "dd. 
mmm yyyy, hh:mm" - day_part, time_part = date_str.split(",") - day_str, month_str, year_str = day_part.split(" ") - - day = int(day_str.strip().replace(".", "")) - month = english_months.index(month_str.strip()) + 1 - year = int(year_str.strip()) - - hour_str, minute_str = time_part.split(":") - hour = int(hour_str) - minute = int(minute_str) - - return datetime(year, month, day, hour, minute) - except Exception: - # TODO: Properly log this - print(f"Could not parse date {date_str!r}") - return None - - -def _format_date_english(date_to_format: date) -> str: - month = english_months[date_to_format.month - 1] - return f"{date_to_format.day:02d}. {month} {date_to_format.year:04d}" - - -def _yesterday() -> date: - return date.today() - timedelta(days=1) - - -def _tomorrow() -> date: - return date.today() + timedelta(days=1) - - -def _sanitize_path_name(name: str) -> str: - return name.replace("/", "-").replace("\\", "-").strip() - - -def _url_set_query_param(url: str, param: str, value: str) -> str: - """ - Set a query parameter in an url, overwriting existing ones with the same name. 
- """ - scheme, netloc, path, query, fragment = urlsplit(url) - query_parameters = parse_qs(query) - query_parameters[param] = [value] - new_query_string = urlencode(query_parameters, doseq=True) - - return urlunsplit((scheme, netloc, path, new_query_string, fragment)) - - -def _url_set_query_params(url: str, params: Dict[str, str]) -> str: - result = url - - for key, val in params.items(): - result = _url_set_query_param(result, key, val) - - return result - - _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.EXERCISE, IliasElementType.FOLDER, @@ -559,7 +96,7 @@ class KitIliasCrawler(HttpCrawler): async def _crawl_course(self, course_id: int) -> None: # Start crawling at the given course - root_url = _url_set_query_param( + root_url = url_set_query_param( self._base_url + "/goto.php", "target", f"crs_{course_id}" ) diff --git a/PFERD/utils.py b/PFERD/utils.py index 0b3d40d..56d6f53 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -6,7 +6,8 @@ import sys from abc import ABC, abstractmethod from contextlib import AsyncExitStack from types import TracebackType -from typing import Any, Callable, Generic, Optional, Type, TypeVar +from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar +from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit import bs4 @@ -38,6 +39,30 @@ def soupify(data: bytes) -> bs4.BeautifulSoup: return bs4.BeautifulSoup(data, "html.parser") +def url_set_query_param(url: str, param: str, value: str) -> str: + """ + Set a query parameter in an url, overwriting existing ones with the same name. + """ + scheme, netloc, path, query, fragment = urlsplit(url) + query_parameters = parse_qs(query) + query_parameters[param] = [value] + new_query_string = urlencode(query_parameters, doseq=True) + + return urlunsplit((scheme, netloc, path, new_query_string, fragment)) + + +def url_set_query_params(url: str, params: Dict[str, str]) -> str: + """ + Sets multiple query parameters in an url, overwriting existing ones. 
+ """ + result = url + + for key, val in params.items(): + result = url_set_query_param(result, key, val) + + return result + + async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: """ Asks the user a yes/no question and returns their choice. From 2976b4d352ac86f718d95c8a193a8bc198615b6b Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 19 May 2021 21:37:10 +0200 Subject: [PATCH 154/524] Move ILIAS file templates to own file --- PFERD/crawlers/ilias/file_templates.py | 87 ++++++++++++++++++ PFERD/crawlers/ilias/kit_web_ilias_crawler.py | 91 +------------------ 2 files changed, 89 insertions(+), 89 deletions(-) create mode 100644 PFERD/crawlers/ilias/file_templates.py diff --git a/PFERD/crawlers/ilias/file_templates.py b/PFERD/crawlers/ilias/file_templates.py new file mode 100644 index 0000000..e9e332e --- /dev/null +++ b/PFERD/crawlers/ilias/file_templates.py @@ -0,0 +1,87 @@ +link_template_plain = "{{link}}" +link_template_rich = """ + + + + + ILIAS - Link: {{name}} + + + + + +
+ +
+ +
{{description}}
+
+ +
+ + +""" # noqa: E501 line too long diff --git a/PFERD/crawlers/ilias/kit_web_ilias_crawler.py b/PFERD/crawlers/ilias/kit_web_ilias_crawler.py index be613e6..46eb662 100644 --- a/PFERD/crawlers/ilias/kit_web_ilias_crawler.py +++ b/PFERD/crawlers/ilias/kit_web_ilias_crawler.py @@ -13,6 +13,7 @@ from PFERD.crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat from PFERD.output_dir import Redownload from PFERD.utils import soupify, url_set_query_param +from .file_templates import link_template_plain, link_template_rich from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement TargetType = Union[str, int] @@ -176,7 +177,7 @@ class KitIliasCrawler(HttpCrawler): real_url: str = html_page.select_one("a").get("href").strip() async with dl as sink: - content = _link_template_plain if self._link_file_use_plaintext else _link_template_rich + content = link_template_plain if self._link_file_use_plaintext else link_template_rich content = content.replace("{{link}}", real_url) content = content.replace("{{name}}", element.name) content = content.replace("{{description}}", str(element.description)) @@ -366,91 +367,3 @@ class KitShibbolethLogin: async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: async with session.post(url, data=data) as response: return soupify(await response.read()) - -_link_template_plain = "{{link}}" -_link_template_rich = """ - - - - - ILIAS - Link: {{name}} - - - - - -
- -
-
- {{name}} -
-
{{description}}
-
- -
- - -""" # noqa: E501 line too long From 81301f3a76f741cb8f6db5aae75e1bb146cead5b Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 19 May 2021 21:41:17 +0200 Subject: [PATCH 155/524] Rename the ilias crawler to ilias web crawler --- PFERD/crawlers/__init__.py | 6 +++--- PFERD/crawlers/ilias/__init__.py | 4 ++-- .../{kit_web_ilias_crawler.py => kit_ilias_web_crawler.py} | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) rename PFERD/crawlers/ilias/{kit_web_ilias_crawler.py => kit_ilias_web_crawler.py} (99%) diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawlers/__init__.py index 72d6798..dc7dfa0 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawlers/__init__.py @@ -4,7 +4,7 @@ from typing import Callable, Dict from ..authenticator import Authenticator from ..config import Config from ..crawler import Crawler -from .ilias import KitIliasCrawler, KitIliasCrawlerSection +from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection from .local import LocalCrawler, LocalCrawlerSection CrawlerConstructor = Callable[[ @@ -17,6 +17,6 @@ CrawlerConstructor = Callable[[ CRAWLERS: Dict[str, CrawlerConstructor] = { "local": lambda n, s, c, a: LocalCrawler(n, LocalCrawlerSection(s), c), - "kit-ilias": lambda n, s, c, a: - KitIliasCrawler(n, KitIliasCrawlerSection(s), c, a), + "kit-ilias-web": lambda n, s, c, a: + KitIliasWebCrawler(n, KitIliasWebCrawlerSection(s), c, a), } diff --git a/PFERD/crawlers/ilias/__init__.py b/PFERD/crawlers/ilias/__init__.py index 15b8d5d..26618a8 100644 --- a/PFERD/crawlers/ilias/__init__.py +++ b/PFERD/crawlers/ilias/__init__.py @@ -1,3 +1,3 @@ -from .kit_web_ilias_crawler import KitIliasCrawler, KitIliasCrawlerSection +from .kit_ilias_web_crawler import KitIliasWebCrawler, KitIliasWebCrawlerSection -__all__ = ["KitIliasCrawler", "KitIliasCrawlerSection"] +__all__ = ["KitIliasWebCrawler", "KitIliasWebCrawlerSection"] diff --git a/PFERD/crawlers/ilias/kit_web_ilias_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py 
similarity index 99% rename from PFERD/crawlers/ilias/kit_web_ilias_crawler.py rename to PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 46eb662..9c7793c 100644 --- a/PFERD/crawlers/ilias/kit_web_ilias_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -19,7 +19,7 @@ from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement TargetType = Union[str, int] -class KitIliasCrawlerSection(CrawlerSection): +class KitIliasWebCrawlerSection(CrawlerSection): def target(self) -> TargetType: target = self.s.get("target") @@ -64,11 +64,11 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([ ]) -class KitIliasCrawler(HttpCrawler): +class KitIliasWebCrawler(HttpCrawler): def __init__( self, name: str, - section: KitIliasCrawlerSection, + section: KitIliasWebCrawlerSection, config: Config, authenticators: Dict[str, Authenticator] ): From 8cfa818f04e97713ffd15f9a39e07728211042d8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 19 May 2021 21:57:55 +0200 Subject: [PATCH 156/524] Only call should_crawl once --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 9c7793c..82ca8d7 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -130,6 +130,12 @@ class KitIliasWebCrawler(HttpCrawler): @arepeat(3) @anoncritical async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: + # We might not want to crawl this directory-ish page. 
+ # This is not in #handle_element, as the download methods check it themselves and therefore + # would perform this check twice - messing with the explain output + if not self.should_crawl(path): + return + tasks = [] async with self.crawl_bar(path): soup = await self._get_page(url) From e4f9560655b2bd8f56a77f0b126d14b1db61b52c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 19 May 2021 22:01:09 +0200 Subject: [PATCH 157/524] Only retry on aiohttp errors in ILIAS crawler This patch removes quite a few retries and now only retries the ilias element method. Every other HTTP-interacting method (except for the root requests) is called from there and should be covered. In the future we also want to retry the root a few times, but that will be done after the download sink API is adjusted. --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 82ca8d7..a025127 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -2,14 +2,14 @@ import asyncio import re from pathlib import PurePath # TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import Any, Dict, Optional, Set, Union +from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union import aiohttp from bs4 import BeautifulSoup, Tag from PFERD.authenticators import Authenticator from PFERD.config import Config -from PFERD.crawler import CrawlerSection, HttpCrawler, anoncritical, arepeat +from PFERD.crawler import CrawlerSection, CrawlWarning, HttpCrawler, anoncritical from PFERD.output_dir import Redownload from PFERD.utils import soupify, url_set_query_param @@ -63,6 +63,29 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, ]) +AWrapped = TypeVar("AWrapped", bound=Callable[..., 
Awaitable[None]]) + + +def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: + def decorator(f: AWrapped) -> AWrapped: + async def wrapper(self: "HttpCrawler", *args: Any, **kwargs: Any) -> None: + for _ in range(attempts - 1): + try: + await f(self, *args, **kwargs) + return + except aiohttp.ContentTypeError: # invalid content type + raise CrawlWarning("ILIAS returned an invalid content type") + except aiohttp.TooManyRedirects: + raise CrawlWarning("Got stuck in a redirect loop") + except aiohttp.ClientPayloadError: # encoding or not enough bytes + pass + except aiohttp.ClientConnectionError: # e.g. timeout, disconnect, resolve failed, etc. + pass + + await f(self, *args, **kwargs) + return wrapper # type: ignore + return decorator + class KitIliasWebCrawler(HttpCrawler): def __init__( @@ -106,7 +129,6 @@ class KitIliasWebCrawler(HttpCrawler): async def _crawl_desktop(self) -> None: await self._crawl_url(self._base_url) - @arepeat(3) async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: tasks = [] @@ -127,8 +149,6 @@ class KitIliasWebCrawler(HttpCrawler): await asyncio.gather(*tasks) - @arepeat(3) - @anoncritical async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: # We might not want to crawl this directory-ish page. 
# This is not in #handle_element, as the download methods check it themselves and therefore @@ -147,12 +167,10 @@ class KitIliasWebCrawler(HttpCrawler): await asyncio.gather(*tasks) @anoncritical + @_iorepeat(3) async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) - if not self.should_crawl(element_path): - return - if element.type == IliasElementType.FILE: await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: @@ -170,7 +188,6 @@ class KitIliasWebCrawler(HttpCrawler): # TODO: Proper exception raise RuntimeError(f"Unknown type: {element.type!r}") - @arepeat(3) async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) if not dl: @@ -191,7 +208,6 @@ class KitIliasWebCrawler(HttpCrawler): sink.file.write(content.encode("utf-8")) sink.done() - @arepeat(3) async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) @@ -212,7 +228,6 @@ class KitIliasWebCrawler(HttpCrawler): sink.done() - @arepeat(3) async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) if not dl: From 83d12fcf2d75650033154c77926728798a4bb541 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 20 May 2021 14:58:54 +0200 Subject: [PATCH 158/524] Add some explains to ilias crawler and use crawler exceptions --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index a025127..88732c0 100644 --- 
a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -6,10 +6,12 @@ from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union import aiohttp from bs4 import BeautifulSoup, Tag +from rich.markup import escape from PFERD.authenticators import Authenticator from PFERD.config import Config -from PFERD.crawler import CrawlerSection, CrawlWarning, HttpCrawler, anoncritical +from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical +from PFERD.logging import log from PFERD.output_dir import Redownload from PFERD.utils import soupify, url_set_query_param @@ -66,10 +68,11 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([ AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) -def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: +def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: def decorator(f: AWrapped) -> AWrapped: async def wrapper(self: "HttpCrawler", *args: Any, **kwargs: Any) -> None: - for _ in range(attempts - 1): + last_exception: Optional[BaseException] = None + for round in range(attempts): try: await f(self, *args, **kwargs) return @@ -77,12 +80,17 @@ def _iorepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: raise CrawlWarning("ILIAS returned an invalid content type") except aiohttp.TooManyRedirects: raise CrawlWarning("Got stuck in a redirect loop") - except aiohttp.ClientPayloadError: # encoding or not enough bytes - pass - except aiohttp.ClientConnectionError: # e.g. timeout, disconnect, resolve failed, etc. - pass + except aiohttp.ClientPayloadError as e: # encoding or not enough bytes + last_exception = e + except aiohttp.ClientConnectionError as e: # e.g. timeout, disconnect, resolve failed, etc. + last_exception = e + log.explain_topic(f"Retrying operation {escape(name)}. 
Retries left: {attempts - 1 - round}") + + if last_exception: + message = f"Error in I/O Operation: {escape(str(last_exception))}" + raise CrawlWarning(message) from last_exception + raise CrawlError("Impossible return in ilias _iorepeat") - await f(self, *args, **kwargs) return wrapper # type: ignore return decorator @@ -109,14 +117,19 @@ class KitIliasWebCrawler(HttpCrawler): async def crawl(self) -> None: if isinstance(self._target, int): + log.explain_topic(f"Inferred crawl target: Course with id {self._target}") await self._crawl_course(self._target) elif self._target == "desktop": + log.explain_topic("Inferred crawl target: Personal desktop") await self._crawl_desktop() else: + log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}") await self._crawl_url(self._target) if self.error_free: await self.cleanup() + else: + log.explain_topic("Skipping file cleanup as errors occurred earlier") async def _crawl_course(self, course_id: int) -> None: # Start crawling at the given course @@ -132,15 +145,16 @@ class KitIliasWebCrawler(HttpCrawler): async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: tasks = [] + # TODO: Retry this when the crawl and download bar are reworked async with self.crawl_bar(PurePath("Root element")): soup = await self._get_page(url) if expected_id is not None: perma_link_element: Tag = soup.find(id="current_perma_link") if not perma_link_element or "crs_" not in perma_link_element.get("value"): - # TODO: Properly handle error - raise RuntimeError( - "Invalid course id? I didn't find anything looking like a course!") + raise CrawlError( + "Invalid course id? I didn't find anything looking like a course" + ) # Duplicated code, but the root page is special - we want to void fetching it twice! 
page = IliasPage(soup, url, None) @@ -167,15 +181,14 @@ class KitIliasWebCrawler(HttpCrawler): await asyncio.gather(*tasks) @anoncritical - @_iorepeat(3) + @_iorepeat(3, "ILIAS element crawling") async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) if element.type == IliasElementType.FILE: await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: - # TODO: Delete - print(f"Skipping forum [green]{element_path}[/]") + log.explain_topic(f"Skipping forum at {escape(str(element_path))}") elif element.type == IliasElementType.LINK: await self._download_link(element, element_path) elif element.type == IliasElementType.VIDEO: @@ -185,8 +198,9 @@ class KitIliasWebCrawler(HttpCrawler): elif element.type in _DIRECTORY_PAGES: await self._handle_ilias_page(element.url, element, element_path) else: - # TODO: Proper exception - raise RuntimeError(f"Unknown type: {element.type!r}") + # This will retry it a few times, failing everytime. It doesn't make any network + # requests, so that's fine. + raise CrawlWarning(f"Unknown element type: {element.type!r}") async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) From 4b104b6252cb5ee97481c0842564922757482f85 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 21 May 2021 12:02:51 +0200 Subject: [PATCH 159/524] Try out some HTTP authentication handling This is by no means final yet and will change a bit once the dl and cl are changed, but it might serve as a first try. It is also wholly untested. 
--- PFERD/crawler.py | 28 +++++++++ PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 63 ++++++++++++------- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 80ecedb..2f8e5ad 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -1,3 +1,4 @@ +import asyncio from abc import ABC, abstractmethod from contextlib import asynccontextmanager from datetime import datetime @@ -265,6 +266,33 @@ class HttpCrawler(Crawler): self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) self._output_dir.register_reserved(self.COOKIE_FILE) + self._authentication_id = 0 + self._authentication_lock = asyncio.Lock() + + async def prepare_request(self) -> int: + # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. + # This should reduce the amount of requests we make: If an authentication is in progress + # all future requests wait for authentication to complete. + async with self._authentication_lock: + return self._authentication_id + + async def authenticate(self, current_id: int) -> None: + async with self._authentication_lock: + # Another thread successfully called authenticate in between + # We do not want to perform auth again, so return here. We can + # assume auth suceeded as authenticate will throw an error if + # it failed. + if current_id != self._authentication_id: + return + await self._authenticate() + self._authentication_id += 1 + + async def _authenticate(self) -> None: + """ + Performs authentication. This method must only return normally if authentication suceeded. + In all other cases it mus either retry internally or throw a terminal exception. 
+ """ + raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") async def run(self) -> None: cookie_jar = aiohttp.CookieJar() diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 88732c0..0ca6565 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -5,14 +5,15 @@ from pathlib import PurePath from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union import aiohttp +from aiohttp import hdrs from bs4 import BeautifulSoup, Tag from rich.markup import escape from PFERD.authenticators import Authenticator from PFERD.config import Config from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical -from PFERD.logging import log -from PFERD.output_dir import Redownload +from PFERD.logging import ProgressBar, log +from PFERD.output_dir import FileSink, Redownload from PFERD.utils import soupify, url_set_query_param from .file_templates import link_template_plain, link_template_rich @@ -232,23 +233,24 @@ class KitIliasWebCrawler(HttpCrawler): page = IliasPage(await self._get_page(element.url), element.url, element) real_element = page.get_child_elements()[0] - async with dl as sink, self.session.get(real_element.url) as resp: - if resp.content_length: - bar.set_total(resp.content_length) - - async for data in resp.content.iter_chunked(1024): - sink.file.write(data) - bar.advance(len(data)) - - sink.done() + async with dl as sink: + await self._stream_from_url(real_element.url, sink, bar) async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) if not dl: return - async with self.download_bar(element_path) as bar: - async with dl as sink, self.session.get(element.url) as resp: + async with self.download_bar(element_path) as bar, dl as sink: + await 
self._stream_from_url(element.url, sink, bar) + + async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: + async def try_stream() -> bool: + async with self.session.get(url, allow_redirects=False) as resp: + # Redirect means we weren't authenticated + if hdrs.LOCATION in resp.headers: + return False + if resp.content_length: bar.set_total(resp.content_length) @@ -257,22 +259,39 @@ class KitIliasWebCrawler(HttpCrawler): bar.advance(len(data)) sink.done() + return True - async def _get_page(self, url: str, retries_left: int = 3) -> BeautifulSoup: - # This function will retry itself a few times if it is not logged in - it won't handle - # connection errors - if retries_left < 0: - # TODO: Proper exception - raise RuntimeError("Get page failed too often") - print(url, "retries left", retries_left) + auth_id = await self.prepare_request() + if await try_stream(): + return + + await self.authenticate(auth_id) + + if not await try_stream(): + raise CrawlError("File streaming failed after authenticate()") + + async def _get_page(self, url: str) -> BeautifulSoup: + auth_id = await self.prepare_request() async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): return soup - await self._shibboleth_login.login(self.session) + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) - return await self._get_page(url, retries_left - 1) + # Retry once after authenticating. If this fails, we will die. + async with self.session.get(url) as request: + soup = soupify(await request.read()) + if self._is_logged_in(soup): + return soup + raise CrawlError("get_page failed even after authenticating") + + # We repeat this as the login method in shibboleth doesn't handle I/O errors. + # Shibboleth is quite reliable as well, the repeat is likely not critical here. 
+ @_iorepeat(3, "Login") + async def _authenticate(self) -> None: + await self._shibboleth_login.login(self.session) @staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: From 98b8ca31faafbc5b27aa6eaa397a6610c2c43f31 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 14:45:32 +0200 Subject: [PATCH 160/524] Add some todos --- PFERD/__main__.py | 6 +++++- PFERD/crawler.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index c03e08c..69feb81 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -266,4 +266,8 @@ def main() -> None: exit() pferd = Pferd(config) - asyncio.run(pferd.run()) + try: + asyncio.run(pferd.run()) + except KeyboardInterrupt: + # TODO Clean up tmp files + pass diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 2f8e5ad..2785e41 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -49,6 +49,7 @@ def noncritical(f: Wrapped) -> Wrapped: log.print(f"[bold bright_red]Warning[/] {escape(str(e))}") crawler.error_free = False except CrawlError as e: + # TODO Don't print error, just pass it on upwards log.print(f"[bold bright_red]Error[/] [red]{escape(str(e))}") crawler.error_free = False raise From b5785f260ed3f1543e95b411b8bc5e6d14b316ae Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 15:03:45 +0200 Subject: [PATCH 161/524] Extract CLI argument parsing to separate module --- PFERD/__main__.py | 189 +----------------------------------------- PFERD/cli/__init__.py | 125 ++++++++++++++++++++++++++++ PFERD/cli/local.py | 67 +++++++++++++++ 3 files changed, 196 insertions(+), 185 deletions(-) create mode 100644 PFERD/cli/__init__.py create mode 100644 PFERD/cli/local.py diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 69feb81..9c60c63 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -3,193 +3,12 @@ import asyncio import configparser from pathlib import Path +from .cli import PARSER, load_default_section from .config import Config, 
ConfigDumpException, ConfigLoadException from .logging import log -from .output_dir import OnConflict, Redownload from .pferd import Pferd from .version import NAME, VERSION -GENERAL_PARSER = argparse.ArgumentParser(add_help=False) -GENERAL_PARSER.add_argument( - "--version", - action="store_true", - help="print version and exit" -) -GENERAL_PARSER.add_argument( - "--config", "-c", - type=Path, - metavar="PATH", - help="custom config file" -) -GENERAL_PARSER.add_argument( - "--dump-config", - nargs="?", - const=True, - metavar="PATH", - help="dump current configuration to a file and exit." - " Uses default config file path if no path is specified" -) -GENERAL_PARSER.add_argument( - "--crawler", - action="append", - type=str, - metavar="NAME", - help="only execute a single crawler." - " Can be specified multiple times to execute multiple crawlers" -) -GENERAL_PARSER.add_argument( - "--working-dir", - type=Path, - metavar="PATH", - help="custom working directory" -) -GENERAL_PARSER.add_argument( - "--explain", "-e", - # TODO Use argparse.BooleanOptionalAction after updating to 3.9 - action="store_const", - const=True, - help="log and explain in detail what PFERD is doing" -) - - -def load_general( - args: argparse.Namespace, - parser: configparser.ConfigParser, -) -> None: - section = parser[parser.default_section] - - if args.working_dir is not None: - section["working_dir"] = str(args.working_dir) - if args.explain is not None: - section["explain"] = "true" if args.explain else "false" - - -CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) -CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( - title="general crawler arguments", - description="arguments common to all crawlers", -) -CRAWLER_PARSER_GROUP.add_argument( - "--redownload", - type=Redownload.from_string, - metavar="OPTION", - help="when to redownload a file that's already present locally" -) -CRAWLER_PARSER_GROUP.add_argument( - "--on-conflict", - type=OnConflict.from_string, - 
metavar="OPTION", - help="what to do when local and remote files or directories differ" -) -CRAWLER_PARSER_GROUP.add_argument( - "--transform", "-t", - action="append", - type=str, - metavar="RULE", - help="add a single transformation rule. Can be specified multiple times" -) -CRAWLER_PARSER_GROUP.add_argument( - "--max-concurrent-tasks", - type=int, - metavar="N", - help="maximum number of concurrent tasks (crawling, downloading)" -) -CRAWLER_PARSER_GROUP.add_argument( - "--max-concurrent-downloads", - type=int, - metavar="N", - help="maximum number of tasks that may download data at the same time" -) -CRAWLER_PARSER_GROUP.add_argument( - "--delay-between-tasks", - type=float, - metavar="SECONDS", - help="time the crawler should wait between subsequent tasks" -) - - -def load_crawler( - args: argparse.Namespace, - section: configparser.SectionProxy, -) -> None: - if args.redownload is not None: - section["redownload"] = args.redownload.value - if args.on_conflict is not None: - section["on_conflict"] = args.on_conflict.value - if args.transform is not None: - section["transform"] = "\n" + "\n".join(args.transform) - if args.max_concurrent_tasks is not None: - section["max_concurrent_tasks"] = str(args.max_concurrent_tasks) - if args.max_concurrent_downloads is not None: - section["max_concurrent_downloads"] = str(args.max_concurrent_downloads) - if args.delay_between_tasks is not None: - section["delay_between_tasks"] = str(args.delay_between_tasks) - - -PARSER = argparse.ArgumentParser(parents=[GENERAL_PARSER]) -PARSER.set_defaults(command=None) -SUBPARSERS = PARSER.add_subparsers(title="crawlers") - - -LOCAL_CRAWLER = SUBPARSERS.add_parser( - "local", - parents=[CRAWLER_PARSER], -) -LOCAL_CRAWLER.set_defaults(command="local") -LOCAL_CRAWLER_GROUP = LOCAL_CRAWLER.add_argument_group( - title="local crawler arguments", - description="arguments for the 'local' crawler", -) -LOCAL_CRAWLER_GROUP.add_argument( - "target", - type=Path, - metavar="TARGET", - 
help="directory to crawl" -) -LOCAL_CRAWLER_GROUP.add_argument( - "output", - type=Path, - metavar="OUTPUT", - help="output directory" -) -LOCAL_CRAWLER_GROUP.add_argument( - "--crawl-delay", - type=float, - metavar="SECONDS", - help="artificial delay to simulate for crawl requests" -) -LOCAL_CRAWLER_GROUP.add_argument( - "--download-delay", - type=float, - metavar="SECONDS", - help="artificial delay to simulate for download requests" -) -LOCAL_CRAWLER_GROUP.add_argument( - "--download-speed", - type=int, - metavar="BYTES_PER_SECOND", - help="download speed to simulate" -) - - -def load_local_crawler( - args: argparse.Namespace, - parser: configparser.ConfigParser, -) -> None: - parser["crawl:local"] = {} - section = parser["crawl:local"] - load_crawler(args, section) - - section["type"] = "local" - section["target"] = str(args.target) - section["output_dir"] = str(args.output) - if args.crawl_delay is not None: - section["crawl_delay"] = str(args.crawl_delay) - if args.download_delay is not None: - section["download_delay"] = str(args.download_delay) - if args.download_speed is not None: - section["download_speed"] = str(args.download_speed) - def load_parser( args: argparse.Namespace, @@ -202,10 +21,10 @@ def load_parser( Config.load_parser(parser, path=args.config) else: log.explain(f"CLI command specified, creating config for {args.command!r}") - if args.command == "local": - load_local_crawler(args, parser) + if args.command: + args.command(args, parser) - load_general(args, parser) + load_default_section(args, parser) prune_crawlers(args, parser) return parser diff --git a/PFERD/cli/__init__.py b/PFERD/cli/__init__.py new file mode 100644 index 0000000..71d9732 --- /dev/null +++ b/PFERD/cli/__init__.py @@ -0,0 +1,125 @@ +import argparse +import configparser +from pathlib import Path + +from ..output_dir import OnConflict, Redownload + +CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) +CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( + 
title="general crawler arguments", + description="arguments common to all crawlers", +) +CRAWLER_PARSER_GROUP.add_argument( + "--redownload", + type=Redownload.from_string, + metavar="OPTION", + help="when to redownload a file that's already present locally" +) +CRAWLER_PARSER_GROUP.add_argument( + "--on-conflict", + type=OnConflict.from_string, + metavar="OPTION", + help="what to do when local and remote files or directories differ" +) +CRAWLER_PARSER_GROUP.add_argument( + "--transform", "-t", + action="append", + type=str, + metavar="RULE", + help="add a single transformation rule. Can be specified multiple times" +) +CRAWLER_PARSER_GROUP.add_argument( + "--max-concurrent-tasks", + type=int, + metavar="N", + help="maximum number of concurrent tasks (crawling, downloading)" +) +CRAWLER_PARSER_GROUP.add_argument( + "--max-concurrent-downloads", + type=int, + metavar="N", + help="maximum number of tasks that may download data at the same time" +) +CRAWLER_PARSER_GROUP.add_argument( + "--delay-between-tasks", + type=float, + metavar="SECONDS", + help="time the crawler should wait between subsequent tasks" +) + + +def load_crawler( + args: argparse.Namespace, + section: configparser.SectionProxy, +) -> None: + if args.redownload is not None: + section["redownload"] = args.redownload.value + if args.on_conflict is not None: + section["on_conflict"] = args.on_conflict.value + if args.transform is not None: + section["transform"] = "\n" + "\n".join(args.transform) + if args.max_concurrent_tasks is not None: + section["max_concurrent_tasks"] = str(args.max_concurrent_tasks) + if args.max_concurrent_downloads is not None: + section["max_concurrent_downloads"] = str(args.max_concurrent_downloads) + if args.delay_between_tasks is not None: + section["delay_between_tasks"] = str(args.delay_between_tasks) + + +PARSER = argparse.ArgumentParser() +PARSER.set_defaults(command=None) +PARSER.add_argument( + "--version", + action="store_true", + help="print version and exit" +) 
+PARSER.add_argument( + "--config", "-c", + type=Path, + metavar="PATH", + help="custom config file" +) +PARSER.add_argument( + "--dump-config", + nargs="?", + const=True, + metavar="PATH", + help="dump current configuration to a file and exit." + " Uses default config file path if no path is specified" +) +PARSER.add_argument( + "--crawler", + action="append", + type=str, + metavar="NAME", + help="only execute a single crawler." + " Can be specified multiple times to execute multiple crawlers" +) +PARSER.add_argument( + "--working-dir", + type=Path, + metavar="PATH", + help="custom working directory" +) +PARSER.add_argument( + "--explain", "-e", + # TODO Use argparse.BooleanOptionalAction after updating to 3.9 + action="store_const", + const=True, + help="log and explain in detail what PFERD is doing" +) + + +def load_default_section( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + section = parser[parser.default_section] + + if args.working_dir is not None: + section["working_dir"] = str(args.working_dir) + if args.explain is not None: + section["explain"] = "true" if args.explain else "false" + + +SUBPARSERS = PARSER.add_subparsers(title="crawlers") diff --git a/PFERD/cli/local.py b/PFERD/cli/local.py new file mode 100644 index 0000000..5df81db --- /dev/null +++ b/PFERD/cli/local.py @@ -0,0 +1,67 @@ +import argparse +import configparser +from pathlib import Path + +from . 
import CRAWLER_PARSER, SUBPARSERS, load_crawler + +SUBPARSER = SUBPARSERS.add_parser( + "local", + parents=[CRAWLER_PARSER], +) + +GROUP = SUBPARSER.add_argument_group( + title="local crawler arguments", + description="arguments for the 'local' crawler", +) +GROUP.add_argument( + "target", + type=Path, + metavar="TARGET", + help="directory to crawl" +) +GROUP.add_argument( + "output", + type=Path, + metavar="OUTPUT", + help="output directory" +) +GROUP.add_argument( + "--crawl-delay", + type=float, + metavar="SECONDS", + help="artificial delay to simulate for crawl requests" +) +GROUP.add_argument( + "--download-delay", + type=float, + metavar="SECONDS", + help="artificial delay to simulate for download requests" +) +GROUP.add_argument( + "--download-speed", + type=int, + metavar="BYTES_PER_SECOND", + help="download speed to simulate" +) + + +def load( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + parser["crawl:local"] = {} + section = parser["crawl:local"] + load_crawler(args, section) + + section["type"] = "local" + section["target"] = str(args.target) + section["output_dir"] = str(args.output) + if args.crawl_delay is not None: + section["crawl_delay"] = str(args.crawl_delay) + if args.download_delay is not None: + section["download_delay"] = str(args.download_delay) + if args.download_speed is not None: + section["download_speed"] = str(args.download_speed) + + +SUBPARSER.set_defaults(command=load) From 54dd2f8337a36a70c789ee7f3aa397677b565244 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 16:47:24 +0200 Subject: [PATCH 162/524] Clean up main and improve error handling --- PFERD/__main__.py | 96 +++++++++++++++++++++++++++++++++-------------- PFERD/config.py | 62 ++++++++++++++++-------------- PFERD/logging.py | 23 ++++++++++++ 3 files changed, 125 insertions(+), 56 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 9c60c63..c418095 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -4,15 
+4,13 @@ import configparser from pathlib import Path from .cli import PARSER, load_default_section -from .config import Config, ConfigDumpException, ConfigLoadException +from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError from .logging import log from .pferd import Pferd from .version import NAME, VERSION -def load_parser( - args: argparse.Namespace, -) -> configparser.ConfigParser: +def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser: log.explain_topic("Loading config") parser = configparser.ConfigParser() @@ -47,46 +45,88 @@ def prune_crawlers( # TODO Check if crawlers actually exist -def main() -> None: - args = PARSER.parse_args() +def load_config(args: argparse.Namespace) -> Config: + try: + return Config(load_config_parser(args)) + except ConfigLoadError as e: + log.error(str(e)) + log.error_contd(e.reason) + exit(1) - # Configure log levels set by command line arguments + +def configure_logging_from_args(args: argparse.Namespace) -> None: if args.explain is not None: log.output_explain = args.explain - if args.dump_config: + + # We want to prevent any unnecessary output if we're printing the config to + # stdout, otherwise it would not be a valid config file. + if args.dump_config == "-": log.output_explain = False + +def configure_logging_from_config(args: argparse.Namespace, config: Config) -> None: + # In configure_logging_from_args(), all normal logging is already disabled + # whenever we dump the config. We don't want to override that decision with + # values from the config file. 
+ if args.dump_config == "-": + return + + try: + if args.explain is None: + log.output_explain = config.default_section.explain() + except ConfigOptionError as e: + log.error(str(e)) + exit(1) + + +def dump_config(args: argparse.Namespace, config: Config) -> None: + try: + if args.dump_config is True: + config.dump() + elif args.dump_config == "-": + config.dump_to_stdout() + else: + config.dump(Path(args.dump_config)) + except ConfigDumpError as e: + log.error(str(e)) + log.error_contd(e.reason) + exit(1) + + +def main() -> None: + args = PARSER.parse_args() + if args.version: print(f"{NAME} {VERSION}") exit() - try: - config = Config(load_parser(args)) - except ConfigLoadException as e: - log.error(f"Failed to load config file at path {str(e.path)!r}") - log.error_contd(f"Reason: {e.reason}") - exit(1) + # Configuring logging happens in two stages because CLI args have + # precedence over config file options and loading the config already + # produces some kinds of log messages (usually only explain()-s). + configure_logging_from_args(args) - # Configure log levels set in the config file - # TODO Catch config section exceptions - if args.explain is None: - log.output_explain = config.default_section.explain() + config = load_config(args) + + # Now, after loading the config file, we can apply its logging settings in + # all places that were not already covered by CLI args. 
+ configure_logging_from_config(args, config) if args.dump_config is not None: - try: - if args.dump_config is True: - config.dump() - elif args.dump_config == "-": - config.dump_to_stdout() - else: - config.dump(Path(args.dump_config)) - except ConfigDumpException: - exit(1) + dump_config(args, config) exit() + # TODO Unset exclusive output on exceptions (if it was being held) pferd = Pferd(config) try: asyncio.run(pferd.run()) except KeyboardInterrupt: + log.explain_topic("Interrupted, exiting immediately") + log.explain("Open files and connections are left for the OS to clean up") + log.explain("Temporary files are not cleaned up") # TODO Clean up tmp files - pass + # And when those files *do* actually get cleaned up properly, + # reconsider what exit code to use here. + exit(1) + except Exception: + log.unexpected_exception() + exit(1) diff --git a/PFERD/config.py b/PFERD/config.py index 30ae3fb..26a9eb6 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -2,7 +2,6 @@ import asyncio import os import sys from configparser import ConfigParser, SectionProxy -from dataclasses import dataclass from pathlib import Path from typing import Any, List, NoReturn, Optional, Tuple @@ -10,21 +9,34 @@ from .logging import log from .utils import prompt_yes_no -@dataclass -class ConfigLoadException(Exception): - path: Path - reason: str +class ConfigLoadError(Exception): + """ + Something went wrong while loading the config from a file. + """ + + def __init__(self, path: Path, reason: str): + super().__init__(f"Failed to load config from {path}") + self.path = path + self.reason = reason -class ConfigDumpException(Exception): - pass +class ConfigOptionError(Exception): + """ + An option in the config file has an invalid or missing value. 
+ """ + + def __init__(self, section: str, key: str, desc: str): + super().__init__(f"Section {section!r}, key {key!r}: {desc}") + self.section = section + self.key = key + self.desc = desc -@dataclass -class ConfigFormatException(Exception): - section: str - key: str - desc: str +class ConfigDumpError(Exception): + def __init__(self, path: Path, reason: str): + super().__init__(f"Failed to dump config to {path}") + self.path = path + self.reason = reason class Section: @@ -36,7 +48,7 @@ class Section: self.s = section def error(self, key: str, desc: str) -> NoReturn: - raise ConfigFormatException(self.s.name, key, desc) + raise ConfigOptionError(self.s.name, key, desc) def invalid_value( self, @@ -83,7 +95,7 @@ class Config: @staticmethod def load_parser(parser: ConfigParser, path: Optional[Path] = None) -> None: """ - May throw a ConfigLoadException. + May throw a ConfigLoadError. """ if path: @@ -99,21 +111,15 @@ class Config: with open(path) as f: parser.read_file(f, source=str(path)) except FileNotFoundError: - raise ConfigLoadException(path, "File does not exist") + raise ConfigLoadError(path, "File does not exist") except IsADirectoryError: - raise ConfigLoadException(path, "That's a directory, not a file") + raise ConfigLoadError(path, "That's a directory, not a file") except PermissionError: - raise ConfigLoadException(path, "Insufficient permissions") - - @staticmethod - def _fail_dump(path: Path, reason: str) -> None: - print(f"Failed to dump config file to {path}") - print(f"Reason: {reason}") - raise ConfigDumpException() + raise ConfigLoadError(path, "Insufficient permissions") def dump(self, path: Optional[Path] = None) -> None: """ - May throw a ConfigDumpException. + May throw a ConfigDumpError. 
""" if not path: @@ -124,7 +130,7 @@ class Config: try: path.parent.mkdir(parents=True, exist_ok=True) except PermissionError: - self._fail_dump(path, "Could not create parent directory") + raise ConfigDumpError(path, "Could not create parent directory") try: # Ensuring we don't accidentally overwrite any existing files by @@ -140,11 +146,11 @@ class Config: with open(path, "w") as f: self._parser.write(f) else: - self._fail_dump(path, "File already exists") + raise ConfigDumpError(path, "File already exists") except IsADirectoryError: - self._fail_dump(path, "That's a directory, not a file") + raise ConfigDumpError(path, "That's a directory, not a file") except PermissionError: - self._fail_dump(path, "Insufficient permissions") + raise ConfigDumpError(path, "Insufficient permissions") def dump_to_stdout(self) -> None: self._parser.write(sys.stdout) diff --git a/PFERD/logging.py b/PFERD/logging.py index e2a6d33..e1ab92f 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -1,4 +1,6 @@ import asyncio +import sys +import traceback from contextlib import asynccontextmanager, contextmanager # TODO In Python 3.9 and above, ContextManager and AsyncContextManager are deprecated from typing import AsyncIterator, ContextManager, Iterator, List, Optional @@ -110,6 +112,27 @@ class Log: def error_contd(self, text: str) -> None: self.print(f"[red]{escape(text)}") + def unexpected_exception(self) -> None: + t, v, tb = sys.exc_info() + + self.error("An unexpected exception occurred") + self.error_contd("") + + for line in traceback.format_tb(tb): + self.error_contd(line[:-1]) # Without trailing newline + + if str(v): + self.error_contd(f"{t.__name__}: {v}") + else: + self.error_contd(t.__name__) + + self.error_contd("") + self.error_contd(""" +An unexpected exception occurred. This usually shouldn't happen. 
Please copy +your program output and send it to the PFERD maintainers, either directly or as +a GitHub issue: https://github.com/Garmelon/PFERD/issues/new + """.strip()) + def explain_topic(self, text: str) -> None: if self.output_explain: self.print(f"[cyan]{escape(text)}") From dfde0e23107a85455489058918d7731e264355ff Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 18:36:25 +0200 Subject: [PATCH 163/524] Improve reporting of unexpected exceptions --- PFERD/logging.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/PFERD/logging.py b/PFERD/logging.py index e1ab92f..8d89baf 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -113,24 +113,27 @@ class Log: self.print(f"[red]{escape(text)}") def unexpected_exception(self) -> None: + """ + Call this in an "except" clause to log an unexpected exception. + """ + t, v, tb = sys.exc_info() - - self.error("An unexpected exception occurred") - self.error_contd("") - - for line in traceback.format_tb(tb): - self.error_contd(line[:-1]) # Without trailing newline - - if str(v): - self.error_contd(f"{t.__name__}: {v}") + if t is None or v is None or tb is None: + # We're not currently handling an exception, so somebody probably + # called this function where they shouldn't. + self.error("Something unexpected happened") + self.error_contd("") + for line in traceback.format_stack(): + self.error_contd(line[:-1]) # Without the newline + self.error_contd("") else: - self.error_contd(t.__name__) + self.error("An unexpected exception occurred") + self.error_contd("") + self.error_contd(traceback.format_exc()) - self.error_contd("") self.error_contd(""" -An unexpected exception occurred. This usually shouldn't happen. 
Please copy -your program output and send it to the PFERD maintainers, either directly or as -a GitHub issue: https://github.com/Garmelon/PFERD/issues/new +Please copy your program output and send it to the PFERD maintainers, either +directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new """.strip()) def explain_topic(self, text: str) -> None: From 552cd82802dad0c53c8cdb971d70f70bff1dc5da Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 18:37:53 +0200 Subject: [PATCH 164/524] Run async input and password getters in daemon thread Previously, it ran in the event loop's default executor, which would block until all its workers were done working. If Ctrl+C was pressed while input or a password were being read, the asyncio.run() call in the main thread would be interrupted however, not the input thread. This meant that multiple key presses (either enter or a second Ctrl+C) were necessary to stop a running PFERD in some circumstances. This change instead runs the input functions in daemon threads so they exit as soon as the main thread exits. 
--- PFERD/utils.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/PFERD/utils.py b/PFERD/utils.py index 56d6f53..1d11565 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -1,8 +1,7 @@ import asyncio -import contextvars -import functools import getpass import sys +import threading from abc import ABC, abstractmethod from contextlib import AsyncExitStack from types import TracebackType @@ -14,21 +13,25 @@ import bs4 T = TypeVar("T") -# TODO When switching to 3.9, use asyncio.to_thread instead of this -async def to_thread(func: Callable[..., T], *args: Any, **kwargs: Any) -> T: - # https://github.com/python/cpython/blob/8d47f92d46a92a5931b8f3dcb4a484df672fc4de/Lib/asyncio/threads.py - loop = asyncio.get_event_loop() - ctx = contextvars.copy_context() - func_call = functools.partial(ctx.run, func, *args, **kwargs) - return await loop.run_in_executor(None, func_call) # type: ignore +async def in_daemon_thread(func: Callable[..., T], *args: Any, **kwargs: Any) -> T: + loop = asyncio.get_running_loop() + future: asyncio.Future[T] = asyncio.Future() + + def thread_func() -> None: + result = func() + loop.call_soon_threadsafe(future.set_result, result) + + threading.Thread(target=thread_func, daemon=True).start() + + return await future async def ainput(prompt: str) -> str: - return await to_thread(lambda: input(prompt)) + return await in_daemon_thread(lambda: input(prompt)) async def agetpass(prompt: str) -> str: - return await to_thread(lambda: getpass.getpass(prompt)) + return await in_daemon_thread(lambda: getpass.getpass(prompt)) def soupify(data: bytes) -> bs4.BeautifulSoup: From afac22c5626b1967001ae9d9fe4bf975a35c9701 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 18:58:00 +0200 Subject: [PATCH 165/524] Handle abort in exclusive output state correctly If the event loop is stopped while something holds the exclusive output, the "log" singleton is now reset so the main thread can print a few more messages 
before exiting. --- PFERD/__main__.py | 3 ++- PFERD/logging.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index c418095..d588836 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -115,11 +115,11 @@ def main() -> None: dump_config(args, config) exit() - # TODO Unset exclusive output on exceptions (if it was being held) pferd = Pferd(config) try: asyncio.run(pferd.run()) except KeyboardInterrupt: + log.unlock() log.explain_topic("Interrupted, exiting immediately") log.explain("Open files and connections are left for the OS to clean up") log.explain("Temporary files are not cleaned up") @@ -128,5 +128,6 @@ def main() -> None: # reconsider what exit code to use here. exit(1) except Exception: + log.unlock() log.unexpected_exception() exit(1) diff --git a/PFERD/logging.py b/PFERD/logging.py index 8d89baf..beb92c6 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -97,12 +97,28 @@ class Log: self.print(line) self._lines = [] + def unlock(self) -> None: + """ + Get rid of an exclusive output state. + + This function is meant to let PFERD print log messages after the event + loop was forcibly stopped and if it will not be started up again. After + this is called, it is not safe to use any functions except the logging + functions (print, warn, ...). + """ + + self._progress_suspended = False + for line in self._lines: + self.print(line) + def print(self, text: str) -> None: if self._progress_suspended: self._lines.append(text) else: self.console.print(text) + # TODO Print errors (and warnings?) 
to stderr + def warn(self, text: str) -> None: self.print(f"[bold bright_red]Warning[/] {escape(text)}") From b4d97cd545a03a2e8bb2fee1b43cf6a1d431dbeb Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 20:54:42 +0200 Subject: [PATCH 166/524] Improve output dir and report error handling --- PFERD/crawler.py | 11 +++++------ PFERD/output_dir.py | 31 +++++++++++-------------------- PFERD/report.py | 25 +++++++++++++------------ 3 files changed, 29 insertions(+), 38 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 2785e41..1269ba2 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -13,7 +13,8 @@ from .authenticator import Authenticator from .config import Config, Section from .limiter import Limiter from .logging import ProgressBar, log -from .output_dir import FileSink, OnConflict, OutputDirectory, Redownload +from .output_dir import FileSink, OnConflict, OutputDirectory, OutputDirError, Redownload +from .report import MarkConflictError, MarkDuplicateError from .transformer import Transformer from .version import NAME, VERSION @@ -45,12 +46,10 @@ def noncritical(f: Wrapped) -> Wrapped: try: f(*args, **kwargs) - except CrawlWarning as e: - log.print(f"[bold bright_red]Warning[/] {escape(str(e))}") + except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: + log.warn(str(e)) crawler.error_free = False - except CrawlError as e: - # TODO Don't print error, just pass it on upwards - log.print(f"[bold bright_red]Error[/] [red]{escape(str(e))}") + except CrawlError: crawler.error_free = False raise diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 783d6bc..ee4910e 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -14,7 +14,7 @@ from typing import AsyncContextManager, BinaryIO, Iterator, Optional, Tuple from rich.markup import escape from .logging import log -from .report import MarkConflictException, MarkDuplicateException, Report +from .report import Report from .utils import 
ReusableAsyncContextManager, prompt_yes_no SUFFIX_CHARS = string.ascii_lowercase + string.digits @@ -22,7 +22,7 @@ SUFFIX_LENGTH = 6 TRIES = 5 -class OutputDirException(Exception): +class OutputDirError(Exception): pass @@ -146,25 +146,15 @@ class OutputDirectory: def register_reserved(self, path: PurePath) -> None: self._report.mark_reserved(path) - def _mark(self, path: PurePath) -> None: - """ - May throw an OutputDirException - """ - - try: - self._report.mark(path) - except MarkDuplicateException: - raise OutputDirException("Another file has already been placed here.") - except MarkConflictException as e: - raise OutputDirException(f"Collides with other file: {e.collides_with}") - def resolve(self, path: PurePath) -> Path: """ - May throw an OutputDirException. + May throw an OutputDirError. """ if ".." in path.parts: - raise OutputDirException(f"Path {path} contains forbidden '..'") + raise OutputDirError(f"Forbidden segment '..' in path {path}") + if "." in path.parts: + raise OutputDirError(f"Forbidden segment '.' in path {path}") return self._root / path def _should_download( @@ -297,7 +287,7 @@ class OutputDirectory: local_path: Path, ) -> Tuple[Path, BinaryIO]: """ - May raise an OutputDirException. + May raise an OutputDirError. """ # Create tmp file @@ -309,7 +299,7 @@ class OutputDirectory: except FileExistsError: pass # Try again - raise OutputDirException(f"Failed to create temporary file {tmp_path}") + raise OutputDirError("Failed to create temporary file") async def download( self, @@ -319,7 +309,8 @@ class OutputDirectory: on_conflict: Optional[OnConflict] = None, ) -> Optional[AsyncContextManager[FileSink]]: """ - May throw an OutputDirException. + May throw an OutputDirError, a MarkDuplicateError or a + MarkConflictError. 
""" heuristics = Heuristics(mtime) @@ -327,7 +318,7 @@ class OutputDirectory: on_conflict = self._on_conflict if on_conflict is None else on_conflict local_path = self.resolve(path) - self._mark(path) + self._report.mark(path) if not self._should_download(local_path, heuristics, redownload): return None diff --git a/PFERD/report.py b/PFERD/report.py index 1c46216..7d8aa85 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -1,19 +1,18 @@ -from dataclasses import dataclass from pathlib import PurePath from typing import Set -@dataclass -class MarkDuplicateException(Exception): +class MarkDuplicateError(Exception): """ Tried to mark a file that was already marked. """ - path: PurePath + def __init__(self, path: PurePath): + super().__init__(f"A previous file already used path {path}") + self.path = path -@dataclass -class MarkConflictException(Exception): +class MarkConflictError(Exception): """ Marking the path would have caused a conflict. @@ -24,8 +23,10 @@ class MarkConflictException(Exception): usually not possible. """ - path: PurePath - collides_with: PurePath + def __init__(self, path: PurePath, collides_with: PurePath): + super().__init__(f"File at {path} collides with previous file at {collides_with}") + self.path = path + self.collides_with = collides_with # TODO Use PurePath.is_relative_to when updating to 3.9 @@ -58,16 +59,16 @@ class Report: """ Mark a previously unknown file as known. - May throw a MarkDuplicateException or a MarkConflictException. For more - detail, see the respective exception's docstring. + May throw a MarkDuplicateError or a MarkConflictError. For more detail, + see the respective exception's docstring. 
""" for other in self.marked: if path == other: - raise MarkDuplicateException(path) + raise MarkDuplicateError(path) if is_relative_to(path, other) or is_relative_to(other, path): - raise MarkConflictException(path, other) + raise MarkConflictError(path, other) self.known_files.add(path) From 9889ce6b57b041e9ba84c003855876a155273ba9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 21:05:32 +0200 Subject: [PATCH 167/524] Improve PFERD error handling --- PFERD/__main__.py | 7 ++++- PFERD/pferd.py | 65 +++++++++++++++++++---------------------------- 2 files changed, 32 insertions(+), 40 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index d588836..9a307b2 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -115,7 +115,12 @@ def main() -> None: dump_config(args, config) exit() - pferd = Pferd(config) + try: + pferd = Pferd(config) + except ConfigOptionError as e: + log.error(str(e)) + exit(1) + try: asyncio.run(pferd.run()) except KeyboardInterrupt: diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 10cd1c2..20c770f 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -1,71 +1,58 @@ from typing import Dict -from rich import print from rich.markup import escape from .authenticator import Authenticator from .authenticators import AUTHENTICATORS -from .config import Config -from .crawler import Crawler +from .config import Config, ConfigOptionError +from .crawler import Crawler, CrawlError from .crawlers import CRAWLERS - - -class PferdLoadException(Exception): - pass +from .logging import log class Pferd: def __init__(self, config: Config): + """ + May throw ConfigOptionError. 
+ """ + self._config = config self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} + self._load_authenticators() + self._load_crawlers() + def _load_authenticators(self) -> None: - abort = False for name, section in self._config.authenticator_sections(): - print(f"[bold bright_cyan]Loading[/] {escape(name)}") - authenticator_type = section.get("type") - authenticator_constructor = AUTHENTICATORS.get(authenticator_type) + log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") + auth_type = section.get("type") + authenticator_constructor = AUTHENTICATORS.get(auth_type) if authenticator_constructor is None: - abort = True - t = escape(repr(authenticator_type)) - print(f"[red]Error: Unknown authenticator type {t}") - continue + raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}") authenticator = authenticator_constructor(name, section, self._config) self._authenticators[name] = authenticator - if abort: - raise PferdLoadException() - def _load_crawlers(self) -> None: - abort = False for name, section in self._config.crawler_sections(): - print(f"[bold bright_cyan]Loading[/] {escape(name)}") - crawler_type = section.get("type") - crawler_constructor = CRAWLERS.get(crawler_type) + log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") + crawl_type = section.get("type") + crawler_constructor = CRAWLERS.get(crawl_type) if crawler_constructor is None: - abort = True - t = escape(repr(crawler_type)) - print(f"[red]Error: Unknown crawler type {t}") - continue + raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}") crawler = crawler_constructor(name, section, self._config, self._authenticators) self._crawlers[name] = crawler - if abort: - raise PferdLoadException() - async def run(self) -> None: - try: - self._load_authenticators() - self._load_crawlers() - except PferdLoadException: - print("[bold red]Could not initialize PFERD properly") - exit(1) - for name, crawler in 
self._crawlers.items(): - print() - print(f"[bold bright_cyan]Running[/] {escape(name)}") + log.print("") + log.print(f"[bold bright_cyan]Running[/] {escape(name)}") - await crawler.run() + try: + await crawler.run() + except CrawlError as e: + log.error(str(e)) + except Exception: + log.unexpected_exception() From 098ac45758a73f145fc15b67d911b1c9698a8f2d Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 21:06:13 +0200 Subject: [PATCH 168/524] Remove deprecated repeat decorators --- PFERD/crawler.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 1269ba2..f5286b8 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -316,33 +316,3 @@ class HttpCrawler(Crawler): cookie_jar.save(self._cookie_jar_path) except Exception: log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") - - -def repeat(attempts: int) -> Callable[[Wrapped], Wrapped]: - """Deprecated.""" - def decorator(f: Wrapped) -> Wrapped: - def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: - for _ in range(attempts - 1): - try: - f(self, *args, **kwargs) - return - except Exception: - pass - f(self, *args, **kwargs) - return wrapper # type: ignore - return decorator - - -def arepeat(attempts: int) -> Callable[[AWrapped], AWrapped]: - """Deprecated.""" - def decorator(f: AWrapped) -> AWrapped: - async def wrapper(self: "Crawler", *args: Any, **kwargs: Any) -> None: - for _ in range(attempts - 1): - try: - await f(self, *args, **kwargs) - return - except Exception: - pass - await f(self, *args, **kwargs) - return wrapper # type: ignore - return decorator From ec95dda18f0f65dd6c5638aba0eb5a9c86f03a3a Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 21:36:53 +0200 Subject: [PATCH 169/524] Unify crawling and downloading steps Now, the progress bar, limiter etc. 
for downloading and crawling are all handled via the reusable CrawlToken and DownloadToken context managers. --- PFERD/crawler.py | 79 +++++++++++++++++++++++++++------------------ PFERD/output_dir.py | 5 ++- 2 files changed, 49 insertions(+), 35 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index f5286b8..42f66a3 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -1,10 +1,8 @@ import asyncio from abc import ABC, abstractmethod -from contextlib import asynccontextmanager from datetime import datetime from pathlib import Path, PurePath -# TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import Any, AsyncContextManager, AsyncIterator, Awaitable, Callable, Dict, Optional, TypeVar +from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, TypeVar import aiohttp from rich.markup import escape @@ -13,9 +11,10 @@ from .authenticator import Authenticator from .config import Config, Section from .limiter import Limiter from .logging import ProgressBar, log -from .output_dir import FileSink, OnConflict, OutputDirectory, OutputDirError, Redownload +from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload from .report import MarkConflictError, MarkDuplicateError from .transformer import Transformer +from .utils import ReusableAsyncContextManager from .version import NAME, VERSION @@ -88,6 +87,36 @@ def anoncritical(f: AWrapped) -> AWrapped: return wrapper # type: ignore +class CrawlToken(ReusableAsyncContextManager[ProgressBar]): + def __init__(self, limiter: Limiter, desc: str): + super().__init__() + + self._limiter = limiter + self._desc = desc + + async def _on_aenter(self) -> ProgressBar: + await self._stack.enter_async_context(self._limiter.limit_crawl()) + bar = self._stack.enter_context(log.crawl_bar(self._desc)) + + return bar + + +class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]): + def __init__(self, limiter: Limiter, fs_token: 
FileSinkToken, desc: str): + super().__init__() + + self._limiter = limiter + self._fs_token = fs_token + self._desc = desc + + async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]: + await self._stack.enter_async_context(self._limiter.limit_crawl()) + sink = await self._stack.enter_async_context(self._fs_token) + bar = self._stack.enter_context(log.crawl_bar(self._desc)) + + return bar, sink + + class CrawlerSection(Section): def output_dir(self, name: str) -> Path: # TODO Use removeprefix() after switching to 3.9 @@ -190,30 +219,12 @@ class Crawler(ABC): section.on_conflict(), ) - @asynccontextmanager - async def crawl_bar( - self, - path: PurePath, - total: Optional[int] = None, - ) -> AsyncIterator[ProgressBar]: + async def crawl(self, path: PurePath) -> Optional[CrawlToken]: + if self._transformer.transform(path) is None: + return None + desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}" - async with self._limiter.limit_crawl(): - with log.crawl_bar(desc, total=total) as bar: - yield bar - - @asynccontextmanager - async def download_bar( - self, - path: PurePath, - total: Optional[int] = None, - ) -> AsyncIterator[ProgressBar]: - desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}" - async with self._limiter.limit_download(): - with log.download_bar(desc, total=total) as bar: - yield bar - - def should_crawl(self, path: PurePath) -> bool: - return self._transformer.transform(path) is not None + return CrawlToken(self._limiter, desc) async def download( self, @@ -221,13 +232,17 @@ class Crawler(ABC): mtime: Optional[datetime] = None, redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, - ) -> Optional[AsyncContextManager[FileSink]]: + ) -> Optional[DownloadToken]: transformed_path = self._transformer.transform(path) if transformed_path is None: return None - return await self._output_dir.download( - transformed_path, mtime, redownload, on_conflict) + fs_token = await 
self._output_dir.download(transformed_path, mtime, redownload, on_conflict) + if fs_token is None: + return None + + desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}" + return DownloadToken(self._limiter, fs_token, desc) async def cleanup(self) -> None: await self._output_dir.cleanup() @@ -239,10 +254,10 @@ class Crawler(ABC): """ with log.show_progress(): - await self.crawl() + await self._run() @abstractmethod - async def crawl(self) -> None: + async def _run(self) -> None: """ Overwrite this function if you are writing a crawler. diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index ee4910e..fef6914 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -8,8 +8,7 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum from pathlib import Path, PurePath -# TODO In Python 3.9 and above, AsyncContextManager is deprecated -from typing import AsyncContextManager, BinaryIO, Iterator, Optional, Tuple +from typing import BinaryIO, Iterator, Optional, Tuple from rich.markup import escape @@ -307,7 +306,7 @@ class OutputDirectory: mtime: Optional[datetime] = None, redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, - ) -> Optional[AsyncContextManager[FileSink]]: + ) -> Optional[FileSinkToken]: """ May throw an OutputDirError, a MarkDuplicateError or a MarkConflictError. 
From e21795ee357f06526915dcc829c8ff78bfb0f7ca Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 21:45:51 +0200 Subject: [PATCH 170/524] Make file cleanup part of default crawler behaviour --- PFERD/crawler.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 42f66a3..ec0e147 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -244,8 +244,15 @@ class Crawler(ABC): desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}" return DownloadToken(self._limiter, fs_token, desc) - async def cleanup(self) -> None: - await self._output_dir.cleanup() + async def _cleanup(self) -> None: + log.explain_topic("Decision: Clean up files?") + if self.error_free: + log.explain("No warnings or errors occurred during this run") + log.explain("Cleaning up files") + await self._output_dir.cleanup() + else: + log.explain("Warnings or errors occurred during this run") + log.explain("Not cleaning up files") async def run(self) -> None: """ @@ -255,6 +262,7 @@ class Crawler(ABC): with log.show_progress(): await self._run() + await self._cleanup() @abstractmethod async def _run(self) -> None: From ae3d80664cee6b03023c854600f157fd83c1c87f Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 21:46:05 +0200 Subject: [PATCH 171/524] Update local crawler to new crawler structure --- PFERD/crawlers/local.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 8cfc79a..176f36d 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -56,10 +56,8 @@ class LocalCrawler(Crawler): else: self._block_size = 1024**2 # 1 MiB - async def crawl(self) -> None: + async def _run(self) -> None: await self._crawl_path(self._target, PurePath()) - if self.error_free: - await self.cleanup() @anoncritical async def _crawl_path(self, path: Path, pure: PurePath) -> None: @@ -69,9 +67,13 @@ class 
LocalCrawler(Crawler): await self._crawl_file(path, pure) async def _crawl_dir(self, path: Path, pure: PurePath) -> None: + cl = await self.crawl(pure) + if not cl: + return + tasks = [] - async with self.crawl_bar(pure): + async with cl: await asyncio.sleep(random.uniform( 0.5 * self._crawl_delay, self._crawl_delay, @@ -79,8 +81,7 @@ class LocalCrawler(Crawler): for child in path.iterdir(): pure_child = pure / child.name - if self.should_crawl(child): - tasks.append(self._crawl_path(child, pure_child)) + tasks.append(self._crawl_path(child, pure_child)) await asyncio.gather(*tasks) @@ -91,7 +92,7 @@ class LocalCrawler(Crawler): if not dl: return - async with self.download_bar(pure) as bar: + async with dl as (bar, sink): await asyncio.sleep(random.uniform( 0.5 * self._download_delay, self._download_delay, @@ -99,19 +100,18 @@ class LocalCrawler(Crawler): bar.set_total(stat.st_size) - async with dl as sink: - with open(path, "rb") as f: - while True: - data = f.read(self._block_size) - if len(data) == 0: - break + with open(path, "rb") as f: + while True: + data = f.read(self._block_size) + if len(data) == 0: + break - sink.file.write(data) - bar.advance(len(data)) + sink.file.write(data) + bar.advance(len(data)) - if self._download_speed: - delay = self._block_size / self._download_speed - delay = random.uniform(0.8 * delay, 1.2 * delay) - await asyncio.sleep(delay) + if self._download_speed: + delay = self._block_size / self._download_speed + delay = random.uniform(0.8 * delay, 1.2 * delay) + await asyncio.sleep(delay) - sink.done() + sink.done() From 8fad8edc1e2ddee4fb46fa85b9f23a65a421d196 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 20:02:15 +0000 Subject: [PATCH 172/524] Remove duplicated beautifulsoup4 dependency --- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index cb85ab0..431c3b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,7 +9,6 @@ install_requires = aiohttp>=3.7.4.post0 beautifulsoup4>=4.9.3 
rich>=10.1.0 - beautifulsoup4>=4.9.3 [options.entry_points] console_scripts = From 662191eca9fa38172e541d9eaa7f217301aaef19 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 20:25:58 +0000 Subject: [PATCH 173/524] Fix crash as soon as first cl or dl token was acquired --- PFERD/__main__.py | 5 ++--- PFERD/pferd.py | 9 ++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 9a307b2..0e84e34 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -117,12 +117,11 @@ def main() -> None: try: pferd = Pferd(config) + asyncio.run(pferd.run()) except ConfigOptionError as e: + log.unlock() log.error(str(e)) exit(1) - - try: - asyncio.run(pferd.run()) except KeyboardInterrupt: log.unlock() log.explain_topic("Interrupted, exiting immediately") diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 20c770f..4aee043 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -20,9 +20,6 @@ class Pferd: self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} - self._load_authenticators() - self._load_crawlers() - def _load_authenticators(self) -> None: for name, section in self._config.authenticator_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") @@ -46,6 +43,12 @@ class Pferd: self._crawlers[name] = crawler async def run(self) -> None: + # These two functions must run inside the same event loop as the + # crawlers, so that any new objects (like Conditions or Futures) can + # obtain the correct event loop. 
+ self._load_authenticators() + self._load_crawlers() + for name, crawler in self._crawlers.items(): log.print("") log.print(f"[bold bright_cyan]Running[/] {escape(name)}") From 1bbc0b705f29452ed105d95687343173b8af60d7 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 20:38:56 +0000 Subject: [PATCH 174/524] Improve transformer error handling --- PFERD/__main__.py | 5 +++++ PFERD/transformer.py | 33 ++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 0e84e34..5cc6ef6 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -7,6 +7,7 @@ from .cli import PARSER, load_default_section from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError from .logging import log from .pferd import Pferd +from .transformer import RuleParseError from .version import NAME, VERSION @@ -122,6 +123,10 @@ def main() -> None: log.unlock() log.error(str(e)) exit(1) + except RuleParseError as e: + log.unlock() + e.pretty_print() + exit(1) except KeyboardInterrupt: log.unlock() log.explain_topic("Interrupted, exiting immediately") diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 135baf2..293274a 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -6,10 +6,11 @@ import ast import re from abc import ABC, abstractmethod -from dataclasses import dataclass from pathlib import PurePath from typing import Dict, Optional, Union +from .logging import log + class Rule(ABC): @abstractmethod @@ -122,16 +123,18 @@ class ReRule(Rule): return False -@dataclass -class RuleParseException(Exception): - line: "Line" - reason: str +class RuleParseError(Exception): + def __init__(self, line: "Line", reason: str): + super().__init__(f"Error in rule on line {line.line_nr}, column {line.index}: {reason}") + + self.line = line + self.reason = reason def pretty_print(self) -> None: - print(f"Error parsing rule on line {self.line.line_nr}:") - print(self.line.line) + 
log.error(f"Error parsing rule on line {self.line.line_nr}:") + log.error_contd(self.line.line) spaces = " " * self.line.index - print(f"{spaces}^--- {self.reason}") + log.error_contd(f"{spaces}^--- {self.reason}") class Line: @@ -170,7 +173,7 @@ class Line: if self.get() == char: self.advance() else: - raise RuleParseException(self, f"Expected {char!r}") + raise RuleParseError(self, f"Expected {char!r}") QUOTATION_MARKS = {'"', "'"} @@ -186,7 +189,7 @@ def parse_string_literal(line: Line) -> str: if quotation_mark not in QUOTATION_MARKS: # This should never happen as long as this function is only called from # parse_string. - raise RuleParseException(line, "Invalid quotation mark") + raise RuleParseError(line, "Invalid quotation mark") line.advance() while c := line.get(): @@ -204,7 +207,7 @@ def parse_string_literal(line: Line) -> str: else: line.advance() - raise RuleParseException(line, "Expected end of string literal") + raise RuleParseError(line, "Expected end of string literal") def parse_until_space_or_eol(line: Line) -> str: @@ -235,12 +238,12 @@ def parse_arrow(line: Line) -> str: while True: c = line.get() if not c: - raise RuleParseException(line, "Expected rest of arrow") + raise RuleParseError(line, "Expected rest of arrow") elif c == "-": line.advance() c = line.get() if not c: - raise RuleParseException(line, "Expected rest of arrow") + raise RuleParseError(line, "Expected rest of arrow") elif c == ">": line.advance() break # End of arrow @@ -267,7 +270,7 @@ def parse_rule(line: Line) -> Rule: left = parse_string(line) if isinstance(left, bool): line.index = leftindex - raise RuleParseException(line, "Left side can't be '!'") + raise RuleParseError(line, "Left side can't be '!'") # Parse arrow parse_whitespace(line) @@ -301,7 +304,7 @@ def parse_rule(line: Line) -> Rule: return NameRule(ReRule(left, right)) else: line.index = arrowindex + 1 # For nicer error message - raise RuleParseException(line, "Invalid arrow name") + raise RuleParseError(line, 
"Invalid arrow name") class Transformer: From 9cb2b68f09504704baf738e71e29ba8f07ec7429 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 20:39:29 +0000 Subject: [PATCH 175/524] Fix arrow parsing error messages --- PFERD/transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 293274a..130473a 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -249,7 +249,7 @@ def parse_arrow(line: Line) -> str: break # End of arrow else: name.append("-") - name.append(c) + continue else: name.append(c) @@ -304,7 +304,7 @@ def parse_rule(line: Line) -> Rule: return NameRule(ReRule(left, right)) else: line.index = arrowindex + 1 # For nicer error message - raise RuleParseError(line, "Invalid arrow name") + raise RuleParseError(line, f"Invalid arrow name {arrowname!r}") class Transformer: From 62f0f7bfc5d539ebac21341dbdb3873f86cef5df Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 20:39:57 +0000 Subject: [PATCH 176/524] Explain crawling and partially explain downloading --- PFERD/crawler.py | 17 ++++++++++++++--- PFERD/transformer.py | 10 ++++++++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index ec0e147..4095c53 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -220,9 +220,14 @@ class Crawler(ABC): ) async def crawl(self, path: PurePath) -> Optional[CrawlToken]: + log.explain_topic(f"Decision: Crawl {path}") + if self._transformer.transform(path) is None: + log.explain("Answer: No") return None + log.explain("Answer: Yes") + desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}" return CrawlToken(self._limiter, desc) @@ -233,26 +238,32 @@ class Crawler(ABC): redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, ) -> Optional[DownloadToken]: + log.explain_topic(f"Decision: Download {path}") + transformed_path = self._transformer.transform(path) if transformed_path is None: + 
log.explain("Answer: No") return None fs_token = await self._output_dir.download(transformed_path, mtime, redownload, on_conflict) if fs_token is None: + log.explain("Answer: No") return None + log.explain("Answer: Yes") + desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}" return DownloadToken(self._limiter, fs_token, desc) async def _cleanup(self) -> None: - log.explain_topic("Decision: Clean up files?") + log.explain_topic("Decision: Clean up files") if self.error_free: log.explain("No warnings or errors occurred during this run") - log.explain("Cleaning up files") + log.explain("Answer: Yes") await self._output_dir.cleanup() else: log.explain("Warnings or errors occurred during this run") - log.explain("Not cleaning up files") + log.explain("Answer: No") async def run(self) -> None: """ diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 130473a..d7d3be8 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -317,16 +317,22 @@ class Transformer: for i, line in enumerate(rules.split("\n")): line = line.strip() if line: - self._rules.append(parse_rule(Line(line, i))) + rule = parse_rule(Line(line, i)) + self._rules.append((line, rule)) def transform(self, path: PurePath) -> Optional[PurePath]: - for rule in self._rules: + for i, (line, rule) in enumerate(self._rules): + log.explain(f"Testing rule {i}: {line}") + result = rule.transform(path) if isinstance(result, PurePath): + log.explain(f"Match! Transformed to {result}") return result elif result: # Exclamation mark + log.explain("Match! 
Ignored") return None else: continue + log.explain("No rule matched, path is unchanged") return path From e724ff7c93b3f05d4125992a0101da2676f5239c Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 22 May 2021 20:44:59 +0000 Subject: [PATCH 177/524] Fix normal arrow --- PFERD/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index d7d3be8..7b5745b 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -51,6 +51,7 @@ class NormalRule(Rule): if left_parts: return None + path_parts.reverse() return PurePath(*path_parts) def transform(self, path: PurePath) -> Union[PurePath, bool]: From 953a1bba93ba67886f4d8f67345801c055f1227d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 22 May 2021 23:18:05 +0200 Subject: [PATCH 178/524] Adjust to new crawl / download names --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 49 +++++++++---------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 0ca6565..f2125aa 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -116,7 +116,7 @@ class KitIliasWebCrawler(HttpCrawler): self._link_file_redirect_delay = section.link_file_redirect_delay() self._link_file_use_plaintext = section.link_file_use_plaintext() - async def crawl(self) -> None: + async def _run(self) -> None: if isinstance(self._target, int): log.explain_topic(f"Inferred crawl target: Course with id {self._target}") await self._crawl_course(self._target) @@ -127,11 +127,6 @@ class KitIliasWebCrawler(HttpCrawler): log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}") await self._crawl_url(self._target) - if self.error_free: - await self.cleanup() - else: - log.explain_topic("Skipping file cleanup as errors occurred earlier") - async def _crawl_course(self, course_id: int) -> None: # Start crawling at the given 
course root_url = url_set_query_param( @@ -144,10 +139,14 @@ class KitIliasWebCrawler(HttpCrawler): await self._crawl_url(self._base_url) async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: + cl = await self.crawl(PurePath(".")) + if not cl: + return + tasks = [] # TODO: Retry this when the crawl and download bar are reworked - async with self.crawl_bar(PurePath("Root element")): + async with cl: soup = await self._get_page(url) if expected_id is not None: @@ -165,14 +164,12 @@ class KitIliasWebCrawler(HttpCrawler): await asyncio.gather(*tasks) async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: - # We might not want to crawl this directory-ish page. - # This is not in #handle_element, as the download methods check it themselves and therefore - # would perform this check twice - messing with the explain output - if not self.should_crawl(path): + cl = await self.crawl(path) + if not cl: return tasks = [] - async with self.crawl_bar(path): + async with cl: soup = await self._get_page(url) page = IliasPage(soup, url, parent) @@ -189,7 +186,9 @@ class KitIliasWebCrawler(HttpCrawler): if element.type == IliasElementType.FILE: await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: - log.explain_topic(f"Skipping forum at {escape(str(element_path))}") + log.explain_topic(f"Decision: Crawl {escape(str(element_path))}") + log.explain("Is a forum") + log.explain("Answer: No") elif element.type == IliasElementType.LINK: await self._download_link(element, element_path) elif element.type == IliasElementType.VIDEO: @@ -208,20 +207,19 @@ class KitIliasWebCrawler(HttpCrawler): if not dl: return - async with self.download_bar(element_path): + async with dl as (bar, sink): export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") async with self.session.get(export_url) as response: html_page: BeautifulSoup = soupify(await response.read()) real_url: str = 
html_page.select_one("a").get("href").strip() - async with dl as sink: - content = link_template_plain if self._link_file_use_plaintext else link_template_rich - content = content.replace("{{link}}", real_url) - content = content.replace("{{name}}", element.name) - content = content.replace("{{description}}", str(element.description)) - content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) - sink.file.write(content.encode("utf-8")) - sink.done() + content = link_template_plain if self._link_file_use_plaintext else link_template_rich + content = content.replace("{{link}}", real_url) + content = content.replace("{{name}}", element.name) + content = content.replace("{{description}}", str(element.description)) + content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) + sink.file.write(content.encode("utf-8")) + sink.done() async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky @@ -229,19 +227,18 @@ class KitIliasWebCrawler(HttpCrawler): if not dl: return - async with self.download_bar(element_path) as bar: + async with dl as (bar, sink): page = IliasPage(await self._get_page(element.url), element.url, element) real_element = page.get_child_elements()[0] - async with dl as sink: - await self._stream_from_url(real_element.url, sink, bar) + await self._stream_from_url(real_element.url, sink, bar) async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) if not dl: return - async with self.download_bar(element_path) as bar, dl as sink: + async with dl as (bar, sink): await self._stream_from_url(element.url, sink, bar) async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: From 4d07de0d717195e445a9c0ad881c40d2d6ea79f2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen 
Date: Sat, 22 May 2021 23:20:21 +0200 Subject: [PATCH 179/524] Adjust forum log message in ilias crawler --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index f2125aa..9db9267 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -187,7 +187,7 @@ class KitIliasWebCrawler(HttpCrawler): await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: log.explain_topic(f"Decision: Crawl {escape(str(element_path))}") - log.explain("Is a forum") + log.explain("Forums are not supported") log.explain("Answer: No") elif element.type == IliasElementType.LINK: await self._download_link(element, element_path) From 3053278721cd6b6e61ff9f6c60eabb41e5f5ba3e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 22 May 2021 23:23:21 +0200 Subject: [PATCH 180/524] Move HTTP crawler to own file --- PFERD/crawler.py | 68 ----------------- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 3 +- PFERD/http_crawler.py | 75 +++++++++++++++++++ 3 files changed, 77 insertions(+), 69 deletions(-) create mode 100644 PFERD/http_crawler.py diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 4095c53..731cfb9 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -1,10 +1,8 @@ -import asyncio from abc import ABC, abstractmethod from datetime import datetime from pathlib import Path, PurePath from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, TypeVar -import aiohttp from rich.markup import escape from .authenticator import Authenticator @@ -15,7 +13,6 @@ from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, Ou from .report import MarkConflictError, MarkDuplicateError from .transformer import Transformer from .utils import ReusableAsyncContextManager -from .version import NAME, VERSION class 
CrawlWarning(Exception): @@ -285,68 +282,3 @@ class Crawler(ABC): """ pass - - -class HttpCrawler(Crawler): - COOKIE_FILE = PurePath(".cookies") - - def __init__( - self, - name: str, - section: CrawlerSection, - config: Config, - ) -> None: - super().__init__(name, section, config) - - self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) - self._output_dir.register_reserved(self.COOKIE_FILE) - self._authentication_id = 0 - self._authentication_lock = asyncio.Lock() - - async def prepare_request(self) -> int: - # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. - # This should reduce the amount of requests we make: If an authentication is in progress - # all future requests wait for authentication to complete. - async with self._authentication_lock: - return self._authentication_id - - async def authenticate(self, current_id: int) -> None: - async with self._authentication_lock: - # Another thread successfully called authenticate in between - # We do not want to perform auth again, so return here. We can - # assume auth suceeded as authenticate will throw an error if - # it failed. - if current_id != self._authentication_id: - return - await self._authenticate() - self._authentication_id += 1 - - async def _authenticate(self) -> None: - """ - Performs authentication. This method must only return normally if authentication suceeded. - In all other cases it mus either retry internally or throw a terminal exception. 
- """ - raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") - - async def run(self) -> None: - cookie_jar = aiohttp.CookieJar() - - try: - cookie_jar.load(self._cookie_jar_path) - except Exception: - pass - - async with aiohttp.ClientSession( - headers={"User-Agent": f"{NAME}/{VERSION}"}, - cookie_jar=cookie_jar, - ) as session: - self.session = session - try: - await super().run() - finally: - del self.session - - try: - cookie_jar.save(self._cookie_jar_path) - except Exception: - log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 9db9267..7ffa993 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -11,7 +11,8 @@ from rich.markup import escape from PFERD.authenticators import Authenticator from PFERD.config import Config -from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, HttpCrawler, anoncritical +from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical +from PFERD.http_crawler import HttpCrawler from PFERD.logging import ProgressBar, log from PFERD.output_dir import FileSink, Redownload from PFERD.utils import soupify, url_set_query_param diff --git a/PFERD/http_crawler.py b/PFERD/http_crawler.py new file mode 100644 index 0000000..2b025e3 --- /dev/null +++ b/PFERD/http_crawler.py @@ -0,0 +1,75 @@ +import asyncio +from pathlib import PurePath + +import aiohttp +from rich.markup import escape + +from .config import Config +from .crawler import Crawler, CrawlerSection +from .logging import log +from .version import NAME, VERSION + + +class HttpCrawler(Crawler): + COOKIE_FILE = PurePath(".cookies") + + def __init__( + self, + name: str, + section: CrawlerSection, + config: Config, + ) -> None: + super().__init__(name, section, config) + + self._cookie_jar_path = 
self._output_dir.resolve(self.COOKIE_FILE) + self._output_dir.register_reserved(self.COOKIE_FILE) + self._authentication_id = 0 + self._authentication_lock = asyncio.Lock() + + async def prepare_request(self) -> int: + # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. + # This should reduce the amount of requests we make: If an authentication is in progress + # all future requests wait for authentication to complete. + async with self._authentication_lock: + return self._authentication_id + + async def authenticate(self, current_id: int) -> None: + async with self._authentication_lock: + # Another thread successfully called authenticate in between + # We do not want to perform auth again, so return here. We can + # assume auth suceeded as authenticate will throw an error if + # it failed. + if current_id != self._authentication_id: + return + await self._authenticate() + self._authentication_id += 1 + + async def _authenticate(self) -> None: + """ + Performs authentication. This method must only return normally if authentication suceeded. + In all other cases it mus either retry internally or throw a terminal exception. 
+ """ + raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") + + async def run(self) -> None: + cookie_jar = aiohttp.CookieJar() + + try: + cookie_jar.load(self._cookie_jar_path) + except Exception: + pass + + async with aiohttp.ClientSession( + headers={"User-Agent": f"{NAME}/{VERSION}"}, + cookie_jar=cookie_jar, + ) as session: + self.session = session + try: + await super().run() + finally: + del self.session + + try: + cookie_jar.save(self._cookie_jar_path) + except Exception: + log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") From adfdc302d7b0a4f2a24f10afc6f360d9b419e5ff Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 22 May 2021 23:30:32 +0200 Subject: [PATCH 181/524] Save cookies after successful authentication in HTTP crawler --- PFERD/http_crawler.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/PFERD/http_crawler.py b/PFERD/http_crawler.py index 2b025e3..41bf612 100644 --- a/PFERD/http_crawler.py +++ b/PFERD/http_crawler.py @@ -1,5 +1,6 @@ import asyncio from pathlib import PurePath +from typing import Optional import aiohttp from rich.markup import escape @@ -25,6 +26,7 @@ class HttpCrawler(Crawler): self._output_dir.register_reserved(self.COOKIE_FILE) self._authentication_id = 0 self._authentication_lock = asyncio.Lock() + self._current_cookie_jar: Optional[aiohttp.CookieJar] = None async def prepare_request(self) -> int: # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. 
@@ -43,6 +45,9 @@ class HttpCrawler(Crawler): return await self._authenticate() self._authentication_id += 1 + # Saving the cookies after the first auth ensures we won't need to re-authenticate + # on the next run, should this one be aborted or crash + await self._save_cookies() async def _authenticate(self) -> None: """ @@ -51,17 +56,29 @@ class HttpCrawler(Crawler): """ raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") - async def run(self) -> None: - cookie_jar = aiohttp.CookieJar() + async def _save_cookies(self) -> None: + log.explain_topic("Saving cookies") + if not self._current_cookie_jar: + log.explain("No cookie jar - save aborted") + return try: - cookie_jar.load(self._cookie_jar_path) + self._current_cookie_jar.save(self._cookie_jar_path) + log.explain(f"Cookies saved to {escape(str(self.COOKIE_FILE))}") + except Exception: + log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") + + async def run(self) -> None: + self._current_cookie_jar = aiohttp.CookieJar() + + try: + self._current_cookie_jar.load(self._cookie_jar_path) except Exception: pass async with aiohttp.ClientSession( headers={"User-Agent": f"{NAME}/{VERSION}"}, - cookie_jar=cookie_jar, + cookie_jar=self._current_cookie_jar, ) as session: self.session = session try: @@ -69,7 +86,5 @@ class HttpCrawler(Crawler): finally: del self.session - try: - cookie_jar.save(self._cookie_jar_path) - except Exception: - log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") + # They are saved in authenticate, but a final save won't hurt + await self._save_cookies() From 8ac85ea0bd6e2efcb6a1cd03fb31fde0a091ea90 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 22 May 2021 23:37:34 +0200 Subject: [PATCH 182/524] Fix a few typos in HttpCrawler --- PFERD/http_crawler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PFERD/http_crawler.py 
b/PFERD/http_crawler.py index 41bf612..b9cfeea 100644 --- a/PFERD/http_crawler.py +++ b/PFERD/http_crawler.py @@ -37,10 +37,10 @@ class HttpCrawler(Crawler): async def authenticate(self, current_id: int) -> None: async with self._authentication_lock: - # Another thread successfully called authenticate in between - # We do not want to perform auth again, so return here. We can - # assume auth suceeded as authenticate will throw an error if - # it failed. + # Another thread successfully called authenticate in-between + # We do not want to perform auth again, so we return here. We can + # assume the other thread suceeded as authenticate will throw an error + # if it failed and aborts the crawl process. if current_id != self._authentication_id: return await self._authenticate() @@ -52,7 +52,7 @@ class HttpCrawler(Crawler): async def _authenticate(self) -> None: """ Performs authentication. This method must only return normally if authentication suceeded. - In all other cases it mus either retry internally or throw a terminal exception. + In all other cases it must either retry internally or throw a terminal exception. 
""" raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") From 53e031d9f6202cad3b14a80f0e26967a93e1d79d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 00:28:27 +0200 Subject: [PATCH 183/524] Reuse dl/cl for I/O retries in ILIAS crawler --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 133 ++++++++++++++---- 1 file changed, 105 insertions(+), 28 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 7ffa993..1bdf5e4 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -72,11 +72,11 @@ AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: def decorator(f: AWrapped) -> AWrapped: - async def wrapper(self: "HttpCrawler", *args: Any, **kwargs: Any) -> None: + async def wrapper(*args: Any, **kwargs: Any) -> None: last_exception: Optional[BaseException] = None for round in range(attempts): try: - await f(self, *args, **kwargs) + await f(*args, **kwargs) return except aiohttp.ContentTypeError: # invalid content type raise CrawlWarning("ILIAS returned an invalid content type") @@ -97,6 +97,43 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: return decorator +def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]: + """ + Wraps any I/O exception in a CrawlWarning. 
+ """ + return _iorepeat(1, name) + + +# Crawler control flow: +# +# crawl_desktop -+ +# | +# crawl_course --+ +# | +# +- crawl_url <-+ +# | +# | +# | @_wrap_io_exception # does not need to retry as children acquire bars +# +> crawl_ilias_element -+ +# ^ | +# | @_io_repeat | # retries internally (before the bar) +# +- crawl_ilias_page <---+ +# | | +# +> get_page | # Handles and retries authentication +# | +# @_io_repeat | # retries internally (before the bar) +# +- download_link <---+ +# | | +# +> resolve_target | # Handles and retries authentication +# | +# @_io_repeat | # retries internally (before the bar) +# +- download_video <---+ +# | | +# | @_io_repeat | # retries internally (before the bar) +# +- download_file <---+ +# | +# +> stream_from_url # Handles and retries authentication + class KitIliasWebCrawler(HttpCrawler): def __init__( self, @@ -169,18 +206,30 @@ class KitIliasWebCrawler(HttpCrawler): if not cl: return - tasks = [] - async with cl: - soup = await self._get_page(url) - page = IliasPage(soup, url, parent) + @_iorepeat(3, "crawling folder") + async def impl() -> None: + assert cl # The function is only reached when cl is not None + tasks = [] + async with cl: + soup = await self._get_page(url) + page = IliasPage(soup, url, parent) - for child in page.get_child_elements(): - tasks.append(self._handle_ilias_element(path, child)) + for child in page.get_child_elements(): + tasks.append(self._handle_ilias_element(path, child)) - await asyncio.gather(*tasks) + # The only point an I/O exception can be thrown is in `get_page`. + # If that happens, no task was spawned yet. Therefore, we only retry + # this method without having spawned a single task. Due to this we do + # not need to cancel anything or worry about this gather call or the forks + # further up. 
+ await asyncio.gather(*tasks) + + await impl() @anoncritical - @_iorepeat(3, "ILIAS element crawling") + # Shouldn't happen but this method must never raise an I/O error as that might interfere with + # handle_ilias_page + @_wrap_io_in_warning("ilias element handling") async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) @@ -208,19 +257,37 @@ class KitIliasWebCrawler(HttpCrawler): if not dl: return - async with dl as (bar, sink): - export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") - async with self.session.get(export_url) as response: - html_page: BeautifulSoup = soupify(await response.read()) - real_url: str = html_page.select_one("a").get("href").strip() + @_iorepeat(3, "link resolving") + async def impl() -> None: + assert dl # This function is only reached when dl is not None + async with dl as (bar, sink): + export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") + real_url = await self._resolve_link_target(export_url) - content = link_template_plain if self._link_file_use_plaintext else link_template_rich - content = content.replace("{{link}}", real_url) - content = content.replace("{{name}}", element.name) - content = content.replace("{{description}}", str(element.description)) - content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) - sink.file.write(content.encode("utf-8")) - sink.done() + content = link_template_plain if self._link_file_use_plaintext else link_template_rich + content = content.replace("{{link}}", real_url) + content = content.replace("{{name}}", element.name) + content = content.replace("{{description}}", str(element.description)) + content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) + sink.file.write(content.encode("utf-8")) + sink.done() + + await impl() + + async def _resolve_link_target(self, export_url: str) -> str: + async with 
self.session.get(export_url, allow_redirects=False) as resp: + # No redirect means we were authenticated + if hdrs.LOCATION not in resp.headers: + return soupify(await resp.read()).select_one("a").get("href").strip() + + self._authenticate() + + async with self.session.get(export_url, allow_redirects=False) as resp: + # No redirect means we were authenticated + if hdrs.LOCATION not in resp.headers: + return soupify(await resp.read()).select_one("a").get("href").strip() + + raise CrawlError("resolve_link_target failed even after authenticating") async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky @@ -228,19 +295,29 @@ class KitIliasWebCrawler(HttpCrawler): if not dl: return - async with dl as (bar, sink): - page = IliasPage(await self._get_page(element.url), element.url, element) - real_element = page.get_child_elements()[0] + @_iorepeat(3, "video download") + async def impl() -> None: + assert dl # The function is only reached when dl is not None + async with dl as (bar, sink): + page = IliasPage(await self._get_page(element.url), element.url, element) + real_element = page.get_child_elements()[0] - await self._stream_from_url(real_element.url, sink, bar) + await self._stream_from_url(real_element.url, sink, bar) + + await impl() async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: dl = await self.download(element_path, mtime=element.mtime) if not dl: return - async with dl as (bar, sink): - await self._stream_from_url(element.url, sink, bar) + @_iorepeat(3, "file download") + async def impl() -> None: + assert dl # The function is only reached when dl is not None + async with dl as (bar, sink): + await self._stream_from_url(element.url, sink, bar) + + await impl() async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: async def try_stream() -> bool: From 
44ecb2fbe77b9c5caa6096d9b4309034b8326ba3 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 10:44:04 +0200 Subject: [PATCH 184/524] Fix cleanup deleting crawler's base directory --- PFERD/output_dir.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index fef6914..02d5fe8 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -385,7 +385,10 @@ class OutputDirectory: self._report.add_file(info.path) async def cleanup(self) -> None: - await self._cleanup_dir(self._root, PurePath()) + if not self._root.exists(): + return + + await self._cleanup_dir(self._root, PurePath(), delete_self=False) async def _cleanup(self, path: Path, pure: PurePath) -> None: if path.is_dir(): @@ -393,15 +396,16 @@ class OutputDirectory: elif path.is_file(): await self._cleanup_file(path, pure) - async def _cleanup_dir(self, path: Path, pure: PurePath) -> None: + async def _cleanup_dir(self, path: Path, pure: PurePath, delete_self: bool = True) -> None: for child in path.iterdir(): pure_child = pure / child.name await self._cleanup(child, pure_child) - try: - path.rmdir() - except OSError: - pass + if delete_self: + try: + path.rmdir() + except OSError: + pass async def _cleanup_file(self, path: Path, pure: PurePath) -> None: if self._report.is_marked(pure): From 6fe51e258f14d478b2585947e18868dd20a60f4c Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 10:44:18 +0200 Subject: [PATCH 185/524] Number rules starting at 1 --- PFERD/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 7b5745b..2604c43 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -323,7 +323,7 @@ class Transformer: def transform(self, path: PurePath) -> Optional[PurePath]: for i, (line, rule) in enumerate(self._rules): - log.explain(f"Testing rule {i}: {line}") + log.explain(f"Testing rule {i+1}: {line}") result = 
rule.transform(path) if isinstance(result, PurePath): From 729ff0a4c7ab8870326232cfb4cc5ef5533b06fd Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 10:44:59 +0200 Subject: [PATCH 186/524] Fix simple authenticator output --- PFERD/authenticators/simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/authenticators/simple.py b/PFERD/authenticators/simple.py index caa0002..bcbe69c 100644 --- a/PFERD/authenticators/simple.py +++ b/PFERD/authenticators/simple.py @@ -37,7 +37,7 @@ class SimpleAuthenticator(Authenticator): if self._username is None: self._username = await ainput("Username: ") else: - print(f"Username: {self.username}") + print(f"Username: {self._username}") if self._password is None: self._password = await agetpass("Password: ") From ec3767c545b674d68afd9fe812caaf4b67f192dd Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 10:52:02 +0200 Subject: [PATCH 187/524] Create crawler base dir at start of crawl --- PFERD/crawler.py | 1 + PFERD/output_dir.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 731cfb9..61f1868 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -269,6 +269,7 @@ class Crawler(ABC): """ with log.show_progress(): + self._output_dir.prepare() await self._run() await self._cleanup() diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 02d5fe8..f9a7c99 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -142,6 +142,14 @@ class OutputDirectory: self._report = Report() + def prepare(self) -> None: + log.explain_topic(f"Creating base directory at {str(self._root.absolute())!r}") + + try: + self._root.mkdir(parents=True, exist_ok=True) + except OSError: + raise OutputDirError("Failed to create base directory") + def register_reserved(self, path: PurePath) -> None: self._report.mark_reserved(path) @@ -385,9 +393,6 @@ class OutputDirectory: self._report.add_file(info.path) async def cleanup(self) -> 
None: - if not self._root.exists(): - return - await self._cleanup_dir(self._root, PurePath(), delete_self=False) async def _cleanup(self, path: Path, pure: PurePath) -> None: From c88f20859aa20f32b17e7f728195f31d6146bbfb Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 11:04:50 +0200 Subject: [PATCH 188/524] Explain config file dumping --- PFERD/__main__.py | 2 ++ PFERD/config.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 5cc6ef6..2578487 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -81,6 +81,8 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N def dump_config(args: argparse.Namespace, config: Config) -> None: + log.explain_topic("Dumping config") + try: if args.dump_config is True: config.dump() diff --git a/PFERD/config.py b/PFERD/config.py index 26a9eb6..e68db53 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -5,6 +5,8 @@ from configparser import ConfigParser, SectionProxy from pathlib import Path from typing import Any, List, NoReturn, Optional, Tuple +from rich.markup import escape + from .logging import log from .utils import prompt_yes_no @@ -122,10 +124,14 @@ class Config: May throw a ConfigDumpError. 
""" - if not path: + if path: + log.explain("Using custom path") + else: + log.explain("Using default path") path = self._default_path() - print(f"Dumping config to {path}") + log.explain(f"Dumping to {str(path.absolute())!r}") + log.print(f"[bold bright_cyan]Dumping[/] to {escape(repr(str(path.absolute())))}") try: path.parent.mkdir(parents=True, exist_ok=True) From 803e5628a22d49877ef1be6d8974901b91605c31 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 11:30:16 +0200 Subject: [PATCH 189/524] Clean up logging Paths are now (hopefully) logged consistently across all crawlers --- PFERD/config.py | 12 ++--- PFERD/crawler.py | 10 ++-- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 15 +++--- PFERD/http_crawler.py | 8 +-- PFERD/logging.py | 32 ++++++++++++ PFERD/output_dir.py | 22 ++++----- PFERD/transformer.py | 3 +- PFERD/utils.py | 49 +++++++++++-------- 8 files changed, 95 insertions(+), 56 deletions(-) diff --git a/PFERD/config.py b/PFERD/config.py index e68db53..3c69fc7 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -8,7 +8,7 @@ from typing import Any, List, NoReturn, Optional, Tuple from rich.markup import escape from .logging import log -from .utils import prompt_yes_no +from .utils import fmt_real_path, prompt_yes_no class ConfigLoadError(Exception): @@ -17,7 +17,7 @@ class ConfigLoadError(Exception): """ def __init__(self, path: Path, reason: str): - super().__init__(f"Failed to load config from {path}") + super().__init__(f"Failed to load config from {fmt_real_path(path)}") self.path = path self.reason = reason @@ -36,7 +36,7 @@ class ConfigOptionError(Exception): class ConfigDumpError(Exception): def __init__(self, path: Path, reason: str): - super().__init__(f"Failed to dump config to {path}") + super().__init__(f"Failed to dump config to {fmt_real_path(path)}") self.path = path self.reason = reason @@ -105,7 +105,7 @@ class Config: else: log.explain("Using default path") path = Config._default_path() - log.explain(f"Loading 
{str(path)!r}") + log.explain(f"Loading {fmt_real_path(path)}") # Using config.read_file instead of config.read because config.read # would just ignore a missing file and carry on. @@ -130,8 +130,8 @@ class Config: log.explain("Using default path") path = self._default_path() - log.explain(f"Dumping to {str(path.absolute())!r}") - log.print(f"[bold bright_cyan]Dumping[/] to {escape(repr(str(path.absolute())))}") + log.explain(f"Dumping to {fmt_real_path(path)}") + log.print(f"[bold bright_cyan]Dumping[/] to {escape(fmt_real_path(path))}") try: path.parent.mkdir(parents=True, exist_ok=True) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 61f1868..53640e3 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -12,7 +12,7 @@ from .logging import ProgressBar, log from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload from .report import MarkConflictError, MarkDuplicateError from .transformer import Transformer -from .utils import ReusableAsyncContextManager +from .utils import ReusableAsyncContextManager, fmt_path class CrawlWarning(Exception): @@ -217,7 +217,7 @@ class Crawler(ABC): ) async def crawl(self, path: PurePath) -> Optional[CrawlToken]: - log.explain_topic(f"Decision: Crawl {path}") + log.explain_topic(f"Decision: Crawl {fmt_path(path)}") if self._transformer.transform(path) is None: log.explain("Answer: No") @@ -225,7 +225,7 @@ class Crawler(ABC): log.explain("Answer: Yes") - desc = f"[bold bright_cyan]Crawling[/] {escape(str(path))}" + desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(path))}" return CrawlToken(self._limiter, desc) async def download( @@ -235,7 +235,7 @@ class Crawler(ABC): redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, ) -> Optional[DownloadToken]: - log.explain_topic(f"Decision: Download {path}") + log.explain_topic(f"Decision: Download {fmt_path(path)}") transformed_path = self._transformer.transform(path) if transformed_path is None: @@ 
-249,7 +249,7 @@ class Crawler(ABC): log.explain("Answer: Yes") - desc = f"[bold bright_cyan]Downloading[/] {escape(str(path))}" + desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(path))}" return DownloadToken(self._limiter, fs_token, desc) async def _cleanup(self) -> None: diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 1bdf5e4..424b4ba 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -1,13 +1,11 @@ import asyncio import re from pathlib import PurePath -# TODO In Python 3.9 and above, AsyncContextManager is deprecated from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union import aiohttp from aiohttp import hdrs from bs4 import BeautifulSoup, Tag -from rich.markup import escape from PFERD.authenticators import Authenticator from PFERD.config import Config @@ -17,6 +15,7 @@ from PFERD.logging import ProgressBar, log from PFERD.output_dir import FileSink, Redownload from PFERD.utils import soupify, url_set_query_param +from ...utils import fmt_path from .file_templates import link_template_plain, link_template_rich from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement @@ -86,10 +85,10 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: last_exception = e except aiohttp.ClientConnectionError as e: # e.g. timeout, disconnect, resolve failed, etc. last_exception = e - log.explain_topic(f"Retrying operation {escape(name)}. Retries left: {attempts - 1 - round}") + log.explain_topic(f"Retrying operation {name}. 
Retries left: {attempts - 1 - round}") if last_exception: - message = f"Error in I/O Operation: {escape(str(last_exception))}" + message = f"Error in I/O Operation: {last_exception}" raise CrawlWarning(message) from last_exception raise CrawlError("Impossible return in ilias _iorepeat") @@ -162,7 +161,7 @@ class KitIliasWebCrawler(HttpCrawler): log.explain_topic("Inferred crawl target: Personal desktop") await self._crawl_desktop() else: - log.explain_topic(f"Inferred crawl target: URL {escape(self._target)}") + log.explain_topic(f"Inferred crawl target: URL {self._target}") await self._crawl_url(self._target) async def _crawl_course(self, course_id: int) -> None: @@ -190,9 +189,7 @@ class KitIliasWebCrawler(HttpCrawler): if expected_id is not None: perma_link_element: Tag = soup.find(id="current_perma_link") if not perma_link_element or "crs_" not in perma_link_element.get("value"): - raise CrawlError( - "Invalid course id? I didn't find anything looking like a course" - ) + raise CrawlError("Invalid course id? Didn't find anything looking like a course") # Duplicated code, but the root page is special - we want to void fetching it twice! 
page = IliasPage(soup, url, None) @@ -236,7 +233,7 @@ class KitIliasWebCrawler(HttpCrawler): if element.type == IliasElementType.FILE: await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: - log.explain_topic(f"Decision: Crawl {escape(str(element_path))}") + log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Forums are not supported") log.explain("Answer: No") elif element.type == IliasElementType.LINK: diff --git a/PFERD/http_crawler.py b/PFERD/http_crawler.py index b9cfeea..15e9ff1 100644 --- a/PFERD/http_crawler.py +++ b/PFERD/http_crawler.py @@ -3,11 +3,11 @@ from pathlib import PurePath from typing import Optional import aiohttp -from rich.markup import escape from .config import Config from .crawler import Crawler, CrawlerSection from .logging import log +from .utils import fmt_real_path from .version import NAME, VERSION @@ -59,14 +59,14 @@ class HttpCrawler(Crawler): async def _save_cookies(self) -> None: log.explain_topic("Saving cookies") if not self._current_cookie_jar: - log.explain("No cookie jar - save aborted") + log.explain("No cookie jar, save aborted") return try: self._current_cookie_jar.save(self._cookie_jar_path) - log.explain(f"Cookies saved to {escape(str(self.COOKIE_FILE))}") + log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}") except Exception: - log.print(f"[bold red]Warning:[/] Failed to save cookies to {escape(str(self.COOKIE_FILE))}") + log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") async def run(self) -> None: self._current_cookie_jar = aiohttp.CookieJar() diff --git a/PFERD/logging.py b/PFERD/logging.py index beb92c6..9eb2c7c 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -112,6 +112,10 @@ class Log: self.print(line) def print(self, text: str) -> None: + """ + Print a normal message. Allows markup. 
+ """ + if self._progress_suspended: self._lines.append(text) else: @@ -120,12 +124,24 @@ class Log: # TODO Print errors (and warnings?) to stderr def warn(self, text: str) -> None: + """ + Print a warning message. Allows no markup. + """ + self.print(f"[bold bright_red]Warning[/] {escape(text)}") def error(self, text: str) -> None: + """ + Print an error message. Allows no markup. + """ + self.print(f"[bold bright_red]Error[/] [red]{escape(text)}") def error_contd(self, text: str) -> None: + """ + Print further lines of an error message. Allows no markup. + """ + self.print(f"[red]{escape(text)}") def unexpected_exception(self) -> None: @@ -153,18 +169,34 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new """.strip()) def explain_topic(self, text: str) -> None: + """ + Print a top-level explain text. Allows no markup. + """ + if self.output_explain: self.print(f"[cyan]{escape(text)}") def explain(self, text: str) -> None: + """ + Print an indented explain text. Allows no markup. + """ + if self.output_explain: self.print(f" {escape(text)}") def action(self, text: str) -> None: + """ + Print a status update while crawling. Allows markup. + """ + if self.output_action: self.print(text) def report(self, text: str) -> None: + """ + Print a report after crawling. Allows markup. 
+ """ + if self.output_report: self.print(text) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index f9a7c99..1f83de6 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -14,7 +14,7 @@ from rich.markup import escape from .logging import log from .report import Report -from .utils import ReusableAsyncContextManager, prompt_yes_no +from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no SUFFIX_CHARS = string.ascii_lowercase + string.digits SUFFIX_LENGTH = 6 @@ -143,7 +143,7 @@ class OutputDirectory: self._report = Report() def prepare(self) -> None: - log.explain_topic(f"Creating base directory at {str(self._root.absolute())!r}") + log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}") try: self._root.mkdir(parents=True, exist_ok=True) @@ -159,9 +159,9 @@ class OutputDirectory: """ if ".." in path.parts: - raise OutputDirError(f"Forbidden segment '..' in path {path}") + raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}") if "." in path.parts: - raise OutputDirError(f"Forbidden segment '.' in path {path}") + raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}") return self._root / path def _should_download( @@ -213,7 +213,7 @@ class OutputDirectory: ) -> bool: if on_conflict == OnConflict.PROMPT: async with log.exclusive_output(): - prompt = f"Replace {path} with remote file?" + prompt = f"Replace {fmt_path(path)} with remote file?" return await prompt_yes_no(prompt, default=False) elif on_conflict == OnConflict.LOCAL_FIRST: return False @@ -232,7 +232,7 @@ class OutputDirectory: ) -> bool: if on_conflict == OnConflict.PROMPT: async with log.exclusive_output(): - prompt = f"Recursively delete {path} and replace with remote file?" + prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" 
return await prompt_yes_no(prompt, default=False) elif on_conflict == OnConflict.LOCAL_FIRST: return False @@ -252,7 +252,7 @@ class OutputDirectory: ) -> bool: if on_conflict == OnConflict.PROMPT: async with log.exclusive_output(): - prompt = f"Delete {parent} so remote file {path} can be downloaded?" + prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" return await prompt_yes_no(prompt, default=False) elif on_conflict == OnConflict.LOCAL_FIRST: return False @@ -271,7 +271,7 @@ class OutputDirectory: ) -> bool: if on_conflict == OnConflict.PROMPT: async with log.exclusive_output(): - prompt = f"Delete {path}?" + prompt = f"Delete {fmt_path(path)}?" return await prompt_yes_no(prompt, default=False) elif on_conflict == OnConflict.LOCAL_FIRST: return False @@ -386,10 +386,10 @@ class OutputDirectory: self._update_metadata(info) if changed: - log.action(f"[bold bright_yellow]Changed[/] {escape(str(info.path))}") + log.action(f"[bold bright_yellow]Changed[/] {escape(fmt_path(info.path))}") self._report.change_file(info.path) else: - log.action(f"[bold bright_green]Added[/] {escape(str(info.path))}") + log.action(f"[bold bright_green]Added[/] {escape(fmt_path(info.path))}") self._report.add_file(info.path) async def cleanup(self) -> None: @@ -419,7 +419,7 @@ class OutputDirectory: if await self._conflict_delete_lf(self._on_conflict, pure): try: path.unlink() - log.action(f"[bold bright_magenta]Deleted[/] {escape(str(path))}") + log.action(f"[bold bright_magenta]Deleted[/] {escape(fmt_path(path))}") self._report.delete_file(pure) except OSError: pass diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 2604c43..9670d0e 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -10,6 +10,7 @@ from pathlib import PurePath from typing import Dict, Optional, Union from .logging import log +from .utils import fmt_path class Rule(ABC): @@ -327,7 +328,7 @@ class Transformer: result = rule.transform(path) if isinstance(result, 
PurePath): - log.explain(f"Match! Transformed to {result}") + log.explain(f"Match! Transformed to {fmt_path(result)}") return result elif result: # Exclamation mark log.explain("Match! Ignored") diff --git a/PFERD/utils.py b/PFERD/utils.py index 1d11565..397feda 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -4,6 +4,7 @@ import sys import threading from abc import ABC, abstractmethod from contextlib import AsyncExitStack +from pathlib import Path, PurePath from types import TracebackType from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit @@ -34,6 +35,30 @@ async def agetpass(prompt: str) -> str: return await in_daemon_thread(lambda: getpass.getpass(prompt)) +async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: + """ + Asks the user a yes/no question and returns their choice. + """ + + if default is True: + query += " [Y/n] " + elif default is False: + query += " [y/N] " + else: + query += " [y/n] " + + while True: + response = (await ainput(query)).strip().lower() + if response == "y": + return True + elif response == "n": + return False + elif response == "" and default is not None: + return default + + print("Please answer with 'y' or 'n'.") + + def soupify(data: bytes) -> bs4.BeautifulSoup: """ Parses HTML to a beautifulsoup object. @@ -66,28 +91,12 @@ def url_set_query_params(url: str, params: Dict[str, str]) -> str: return result -async def prompt_yes_no(query: str, default: Optional[bool]) -> bool: - """ - Asks the user a yes/no question and returns their choice. 
- """ +def fmt_path(path: PurePath) -> str: + return repr(str(path)) - if default is True: - query += " [Y/n] " - elif default is False: - query += " [y/N] " - else: - query += " [y/n] " - while True: - response = (await ainput(query)).strip().lower() - if response == "y": - return True - elif response == "n": - return False - elif response == "" and default is not None: - return default - - print("Please answer with 'y' or 'n'.") +def fmt_real_path(path: Path) -> str: + return repr(str(path.absolute())) class ReusableAsyncContextManager(ABC, Generic[T]): From 25e2abdb033f1029bb6a841cb1d45958313907f3 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 11:45:14 +0200 Subject: [PATCH 190/524] Improve transformer explain wording --- PFERD/transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 9670d0e..5a20207 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -328,10 +328,10 @@ class Transformer: result = rule.transform(path) if isinstance(result, PurePath): - log.explain(f"Match! Transformed to {fmt_path(result)}") + log.explain(f"Match found, transformed path to {fmt_path(result)}") return result elif result: # Exclamation mark - log.explain("Match! 
Ignored") + log.explain("Match found, path ignored") return None else: continue From 33a81a5f5c6cfa8756989cd682a9745df5ce4978 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 11:55:34 +0200 Subject: [PATCH 191/524] Document authentication in HTTP crawler and rename prepare_request --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 4 ++-- PFERD/http_crawler.py | 21 ++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 424b4ba..cde8654 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -333,7 +333,7 @@ class KitIliasWebCrawler(HttpCrawler): sink.done() return True - auth_id = await self.prepare_request() + auth_id = await self._current_auth_id() if await try_stream(): return @@ -343,7 +343,7 @@ class KitIliasWebCrawler(HttpCrawler): raise CrawlError("File streaming failed after authenticate()") async def _get_page(self, url: str) -> BeautifulSoup: - auth_id = await self.prepare_request() + auth_id = await self._current_auth_id() async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): diff --git a/PFERD/http_crawler.py b/PFERD/http_crawler.py index 15e9ff1..adbac5d 100644 --- a/PFERD/http_crawler.py +++ b/PFERD/http_crawler.py @@ -28,20 +28,35 @@ class HttpCrawler(Crawler): self._authentication_lock = asyncio.Lock() self._current_cookie_jar: Optional[aiohttp.CookieJar] = None - async def prepare_request(self) -> int: + async def _current_auth_id(self) -> int: + """ + Returns the id for the current authentication, i.e. an identifier for the last + successful call to [authenticate]. + + This method must be called before any request that might authenticate is made, so the + HttpCrawler can properly track when [authenticate] can return early and when actual + authentication is necessary. 
+ """ # We acquire the lock here to ensure we wait for any concurrent authenticate to finish. # This should reduce the amount of requests we make: If an authentication is in progress # all future requests wait for authentication to complete. async with self._authentication_lock: return self._authentication_id - async def authenticate(self, current_id: int) -> None: + async def authenticate(self, caller_auth_id: int) -> None: + """ + Starts the authentication process. The main work is offloaded to _authenticate, which + you should overwrite in a subclass if needed. This method should *NOT* be overwritten. + + The [caller_auth_id] should be the result of a [_current_auth_id] call made *before* + the request was made. This ensures that authentication is not performed needlessly. + """ async with self._authentication_lock: # Another thread successfully called authenticate in-between # We do not want to perform auth again, so we return here. We can # assume the other thread suceeded as authenticate will throw an error # if it failed and aborts the crawl process. 
- if current_id != self._authentication_id: + if caller_auth_id != self._authentication_id: return await self._authenticate() self._authentication_id += 1 From e81005ae4bccda785c19e7dd869e480ee487a5ee Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 11:57:59 +0200 Subject: [PATCH 192/524] Fix CLI arguments --- PFERD/cli/__init__.py | 131 ++--------------------- PFERD/cli/{local.py => command_local.py} | 2 +- PFERD/cli/parser.py | 125 +++++++++++++++++++++ 3 files changed, 134 insertions(+), 124 deletions(-) rename PFERD/cli/{local.py => command_local.py} (96%) create mode 100644 PFERD/cli/parser.py diff --git a/PFERD/cli/__init__.py b/PFERD/cli/__init__.py index 71d9732..2a9f124 100644 --- a/PFERD/cli/__init__.py +++ b/PFERD/cli/__init__.py @@ -1,125 +1,10 @@ -import argparse -import configparser -from pathlib import Path +# isort: skip_file -from ..output_dir import OnConflict, Redownload +# The order of imports matters because each command module registers itself +# with the parser from ".parser". Because of this, isort is disabled for this +# file. Also, since we're reexporting or just using the side effect of +# importing itself, we get a few linting warnings, which we're disabling as +# well. -CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) -CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( - title="general crawler arguments", - description="arguments common to all crawlers", -) -CRAWLER_PARSER_GROUP.add_argument( - "--redownload", - type=Redownload.from_string, - metavar="OPTION", - help="when to redownload a file that's already present locally" -) -CRAWLER_PARSER_GROUP.add_argument( - "--on-conflict", - type=OnConflict.from_string, - metavar="OPTION", - help="what to do when local and remote files or directories differ" -) -CRAWLER_PARSER_GROUP.add_argument( - "--transform", "-t", - action="append", - type=str, - metavar="RULE", - help="add a single transformation rule. 
Can be specified multiple times" -) -CRAWLER_PARSER_GROUP.add_argument( - "--max-concurrent-tasks", - type=int, - metavar="N", - help="maximum number of concurrent tasks (crawling, downloading)" -) -CRAWLER_PARSER_GROUP.add_argument( - "--max-concurrent-downloads", - type=int, - metavar="N", - help="maximum number of tasks that may download data at the same time" -) -CRAWLER_PARSER_GROUP.add_argument( - "--delay-between-tasks", - type=float, - metavar="SECONDS", - help="time the crawler should wait between subsequent tasks" -) - - -def load_crawler( - args: argparse.Namespace, - section: configparser.SectionProxy, -) -> None: - if args.redownload is not None: - section["redownload"] = args.redownload.value - if args.on_conflict is not None: - section["on_conflict"] = args.on_conflict.value - if args.transform is not None: - section["transform"] = "\n" + "\n".join(args.transform) - if args.max_concurrent_tasks is not None: - section["max_concurrent_tasks"] = str(args.max_concurrent_tasks) - if args.max_concurrent_downloads is not None: - section["max_concurrent_downloads"] = str(args.max_concurrent_downloads) - if args.delay_between_tasks is not None: - section["delay_between_tasks"] = str(args.delay_between_tasks) - - -PARSER = argparse.ArgumentParser() -PARSER.set_defaults(command=None) -PARSER.add_argument( - "--version", - action="store_true", - help="print version and exit" -) -PARSER.add_argument( - "--config", "-c", - type=Path, - metavar="PATH", - help="custom config file" -) -PARSER.add_argument( - "--dump-config", - nargs="?", - const=True, - metavar="PATH", - help="dump current configuration to a file and exit." - " Uses default config file path if no path is specified" -) -PARSER.add_argument( - "--crawler", - action="append", - type=str, - metavar="NAME", - help="only execute a single crawler." 
- " Can be specified multiple times to execute multiple crawlers" -) -PARSER.add_argument( - "--working-dir", - type=Path, - metavar="PATH", - help="custom working directory" -) -PARSER.add_argument( - "--explain", "-e", - # TODO Use argparse.BooleanOptionalAction after updating to 3.9 - action="store_const", - const=True, - help="log and explain in detail what PFERD is doing" -) - - -def load_default_section( - args: argparse.Namespace, - parser: configparser.ConfigParser, -) -> None: - section = parser[parser.default_section] - - if args.working_dir is not None: - section["working_dir"] = str(args.working_dir) - if args.explain is not None: - section["explain"] = "true" if args.explain else "false" - - -SUBPARSERS = PARSER.add_subparsers(title="crawlers") +from . import command_local # noqa: F401 imported but unused +from .parser import PARSER, load_default_section # noqa: F401 imported but unused diff --git a/PFERD/cli/local.py b/PFERD/cli/command_local.py similarity index 96% rename from PFERD/cli/local.py rename to PFERD/cli/command_local.py index 5df81db..73f9d43 100644 --- a/PFERD/cli/local.py +++ b/PFERD/cli/command_local.py @@ -2,7 +2,7 @@ import argparse import configparser from pathlib import Path -from . 
import CRAWLER_PARSER, SUBPARSERS, load_crawler +from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler SUBPARSER = SUBPARSERS.add_parser( "local", diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py new file mode 100644 index 0000000..71d9732 --- /dev/null +++ b/PFERD/cli/parser.py @@ -0,0 +1,125 @@ +import argparse +import configparser +from pathlib import Path + +from ..output_dir import OnConflict, Redownload + +CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) +CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( + title="general crawler arguments", + description="arguments common to all crawlers", +) +CRAWLER_PARSER_GROUP.add_argument( + "--redownload", + type=Redownload.from_string, + metavar="OPTION", + help="when to redownload a file that's already present locally" +) +CRAWLER_PARSER_GROUP.add_argument( + "--on-conflict", + type=OnConflict.from_string, + metavar="OPTION", + help="what to do when local and remote files or directories differ" +) +CRAWLER_PARSER_GROUP.add_argument( + "--transform", "-t", + action="append", + type=str, + metavar="RULE", + help="add a single transformation rule. 
Can be specified multiple times" +) +CRAWLER_PARSER_GROUP.add_argument( + "--max-concurrent-tasks", + type=int, + metavar="N", + help="maximum number of concurrent tasks (crawling, downloading)" +) +CRAWLER_PARSER_GROUP.add_argument( + "--max-concurrent-downloads", + type=int, + metavar="N", + help="maximum number of tasks that may download data at the same time" +) +CRAWLER_PARSER_GROUP.add_argument( + "--delay-between-tasks", + type=float, + metavar="SECONDS", + help="time the crawler should wait between subsequent tasks" +) + + +def load_crawler( + args: argparse.Namespace, + section: configparser.SectionProxy, +) -> None: + if args.redownload is not None: + section["redownload"] = args.redownload.value + if args.on_conflict is not None: + section["on_conflict"] = args.on_conflict.value + if args.transform is not None: + section["transform"] = "\n" + "\n".join(args.transform) + if args.max_concurrent_tasks is not None: + section["max_concurrent_tasks"] = str(args.max_concurrent_tasks) + if args.max_concurrent_downloads is not None: + section["max_concurrent_downloads"] = str(args.max_concurrent_downloads) + if args.delay_between_tasks is not None: + section["delay_between_tasks"] = str(args.delay_between_tasks) + + +PARSER = argparse.ArgumentParser() +PARSER.set_defaults(command=None) +PARSER.add_argument( + "--version", + action="store_true", + help="print version and exit" +) +PARSER.add_argument( + "--config", "-c", + type=Path, + metavar="PATH", + help="custom config file" +) +PARSER.add_argument( + "--dump-config", + nargs="?", + const=True, + metavar="PATH", + help="dump current configuration to a file and exit." + " Uses default config file path if no path is specified" +) +PARSER.add_argument( + "--crawler", + action="append", + type=str, + metavar="NAME", + help="only execute a single crawler." 
+ " Can be specified multiple times to execute multiple crawlers" +) +PARSER.add_argument( + "--working-dir", + type=Path, + metavar="PATH", + help="custom working directory" +) +PARSER.add_argument( + "--explain", "-e", + # TODO Use argparse.BooleanOptionalAction after updating to 3.9 + action="store_const", + const=True, + help="log and explain in detail what PFERD is doing" +) + + +def load_default_section( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + section = parser[parser.default_section] + + if args.working_dir is not None: + section["working_dir"] = str(args.working_dir) + if args.explain is not None: + section["explain"] = "true" if args.explain else "false" + + +SUBPARSERS = PARSER.add_subparsers(title="crawlers") From 3d4b997d4a58bdb728a4f653ee0b0ffec2d662ff Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 12:24:10 +0200 Subject: [PATCH 193/524] Retry crawl_url and work around Python's closure handling Closures capture the scope and not the variables. Therefore, any type-narrowing performed by mypy on captured variables is lost inside the closure. 
--- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 72 +++++++++++-------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index cde8654..2f27683 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -109,6 +109,7 @@ def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]: # | # crawl_course --+ # | +# @_io_repeat | # retries internally (before the bar) # +- crawl_url <-+ # | # | @@ -176,36 +177,45 @@ class KitIliasWebCrawler(HttpCrawler): await self._crawl_url(self._base_url) async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: - cl = await self.crawl(PurePath(".")) - if not cl: + maybe_cl = await self.crawl(PurePath(".")) + if not maybe_cl: return + cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 - tasks = [] + @_iorepeat(3, "crawling url") + async def impl() -> None: + tasks = [] - # TODO: Retry this when the crawl and download bar are reworked - async with cl: - soup = await self._get_page(url) + async with cl: + soup = await self._get_page(url) - if expected_id is not None: - perma_link_element: Tag = soup.find(id="current_perma_link") - if not perma_link_element or "crs_" not in perma_link_element.get("value"): - raise CrawlError("Invalid course id? Didn't find anything looking like a course") + if expected_id is not None: + perma_link_element: Tag = soup.find(id="current_perma_link") + if not perma_link_element or "crs_" not in perma_link_element.get("value"): + raise CrawlError("Invalid course id? Didn't find anything looking like a course") - # Duplicated code, but the root page is special - we want to void fetching it twice! 
- page = IliasPage(soup, url, None) - for child in page.get_child_elements(): - tasks.append(self._handle_ilias_element(PurePath("."), child)) + # Duplicated code, but the root page is special - we want to void fetching it twice! + page = IliasPage(soup, url, None) + for child in page.get_child_elements(): + tasks.append(self._handle_ilias_element(PurePath("."), child)) - await asyncio.gather(*tasks) + # The only point an I/O exception can be thrown is in `get_page`. + # If that happens, no task was spawned yet. Therefore, we only retry + # this method without having spawned a single task. Due to this we do + # not need to cancel anything or worry about this gather call or the forks + # further up. + await asyncio.gather(*tasks) + + await impl() async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: - cl = await self.crawl(path) - if not cl: + maybe_cl = await self.crawl(path) + if not maybe_cl: return + cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 @_iorepeat(3, "crawling folder") async def impl() -> None: - assert cl # The function is only reached when cl is not None tasks = [] async with cl: soup = await self._get_page(url) @@ -225,8 +235,8 @@ class KitIliasWebCrawler(HttpCrawler): @anoncritical # Shouldn't happen but this method must never raise an I/O error as that might interfere with - # handle_ilias_page - @_wrap_io_in_warning("ilias element handling") + # handle_ilias_page or crawl_url + @_wrap_io_in_warning("handling ilias element") async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) @@ -250,13 +260,13 @@ class KitIliasWebCrawler(HttpCrawler): raise CrawlWarning(f"Unknown element type: {element.type!r}") async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: - dl = await self.download(element_path, mtime=element.mtime) - if not dl: + 
maybe_dl = await self.download(element_path, mtime=element.mtime) + if not maybe_dl: return + dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 - @_iorepeat(3, "link resolving") + @_iorepeat(3, "resolving link") async def impl() -> None: - assert dl # This function is only reached when dl is not None async with dl as (bar, sink): export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") real_url = await self._resolve_link_target(export_url) @@ -288,11 +298,12 @@ class KitIliasWebCrawler(HttpCrawler): async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky - dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) - if not dl: + maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) + if not maybe_dl: return + dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 - @_iorepeat(3, "video download") + @_iorepeat(3, "downloading video") async def impl() -> None: assert dl # The function is only reached when dl is not None async with dl as (bar, sink): @@ -304,11 +315,12 @@ class KitIliasWebCrawler(HttpCrawler): await impl() async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: - dl = await self.download(element_path, mtime=element.mtime) - if not dl: + maybe_dl = await self.download(element_path, mtime=element.mtime) + if not maybe_dl: return + dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 - @_iorepeat(3, "file download") + @_iorepeat(3, "downloading file") async def impl() -> None: assert dl # The function is only reached when dl is not None async with dl as (bar, sink): From ecdedfa1cfa2bfac01a4b1f96046eaec146eb9fe Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 
12:36:09 +0200 Subject: [PATCH 194/524] Add no-videos flag to ILIAS crawler --- CONFIG.md | 1 + PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/CONFIG.md b/CONFIG.md index 29fc7e2..e92858f 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -136,6 +136,7 @@ This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor S - `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional HTML page instead. +- `no-videos`: If this is set to true, PFERD will not crawl or download any videos. ## Authenticator types ### The `simple` authenticator diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 2f27683..f69d769 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -57,6 +57,9 @@ class KitIliasWebCrawlerSection(CrawlerSection): def link_file_use_plaintext(self) -> bool: return self.s.getboolean("link_file_plain_text", fallback=False) + def no_videos(self) -> bool: + return self.s.getboolean("no-videos", fallback=True) + _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.EXERCISE, @@ -66,6 +69,13 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, ]) +_VIDEO_ELEMENTS: Set[IliasElementType] = set([ + IliasElementType.VIDEO, + IliasElementType.VIDEO_PLAYER, + IliasElementType.VIDEO_FOLDER, + IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, +]) + AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) @@ -153,6 +163,7 @@ class KitIliasWebCrawler(HttpCrawler): self._target = section.target() self._link_file_redirect_delay = section.link_file_redirect_delay() self._link_file_use_plaintext = section.link_file_use_plaintext() + self._no_videos = 
section.no_videos() async def _run(self) -> None: if isinstance(self._target, int): @@ -240,6 +251,16 @@ class KitIliasWebCrawler(HttpCrawler): async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) + if element.type in _VIDEO_ELEMENTS: + log.explain_topic(f"Decision: Crawl video element {fmt_path(element_path)}") + if self._no_videos: + log.explain("Video crawling is disabled") + log.explain("Answer: no") + return + else: + log.explain("Video crawling is enabled") + log.explain("Answer: yes") + if element.type == IliasElementType.FILE: await self._download_file(element, element_path) elif element.type == IliasElementType.FORUM: From 7e0bb0625924ea27fba57687b40e70d32d335c21 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 12:47:30 +0200 Subject: [PATCH 195/524] Clean up TODOs --- PFERD/crawlers/ilias/kit_ilias_html.py | 1 - PFERD/logging.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_html.py b/PFERD/crawlers/ilias/kit_ilias_html.py index 17eb855..eed0884 100644 --- a/PFERD/crawlers/ilias/kit_ilias_html.py +++ b/PFERD/crawlers/ilias/kit_ilias_html.py @@ -3,7 +3,6 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -# TODO In Python 3.9 and above, AsyncContextManager is deprecated from typing import List, Optional, Union from urllib.parse import urljoin, urlparse diff --git a/PFERD/logging.py b/PFERD/logging.py index 9eb2c7c..ef6ee4c 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -2,7 +2,7 @@ import asyncio import sys import traceback from contextlib import asynccontextmanager, contextmanager -# TODO In Python 3.9 and above, ContextManager and AsyncContextManager are deprecated +# TODO In Python 3.9 and above, ContextManager is deprecated from typing import AsyncIterator, ContextManager, Iterator, List, Optional from rich.console import 
Console, RenderGroup From b44b49476d36ef93bcaf4ab836fba30b0e56e3e6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 13:23:28 +0200 Subject: [PATCH 196/524] Fix noncritical and anoncritical decorators I must've forgot to update the anoncritical decorator when I last changed the noncritical decorator. Also, every exception should make the crawler not error_free, not just CrawlErrors. --- PFERD/crawler.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 53640e3..c1184c0 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -28,8 +28,11 @@ Wrapped = TypeVar("Wrapped", bound=Callable[..., None]) def noncritical(f: Wrapped) -> Wrapped: """ - Catches all exceptions occuring during the function call. If an exception - occurs, the crawler's error_free variable is set to False. + Catches and logs a few noncritical exceptions occurring during the function + call, mainly CrawlWarning. + + If any exception occurs during the function call, the crawler's error_free + variable is set to False. This includes noncritical exceptions. Warning: Must only be applied to member functions of the Crawler class! """ @@ -45,7 +48,7 @@ def noncritical(f: Wrapped) -> Wrapped: except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: log.warn(str(e)) crawler.error_free = False - except CrawlError: + except: # noqa: E722 do not use bare 'except' crawler.error_free = False raise @@ -59,8 +62,11 @@ def anoncritical(f: AWrapped) -> AWrapped: """ An async version of @noncritical. - Catches all exceptions occuring during the function call. If an exception - occurs, the crawler's error_free variable is set to False. + Catches and logs a few noncritical exceptions occurring during the function + call, mainly CrawlWarning. + + If any exception occurs during the function call, the crawler's error_free + variable is set to False. This includes noncritical exceptions. 
Warning: Must only be applied to member functions of the Crawler class! """ @@ -73,11 +79,10 @@ def anoncritical(f: AWrapped) -> AWrapped: try: await f(*args, **kwargs) - except CrawlWarning as e: - log.print(f"[bold bright_red]Warning[/] {escape(str(e))}") + except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: + log.warn(str(e)) crawler.error_free = False - except CrawlError as e: - log.print(f"[bold bright_red]Error[/] [red]{escape(str(e))}") + except: # noqa: E722 do not use bare 'except' crawler.error_free = False raise From e1d18708b3cf6a93e0b1f2ea1f94a657656e6db2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 13:26:23 +0200 Subject: [PATCH 197/524] Rename "no_videos" to videos --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index f69d769..9094a7b 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -55,10 +55,10 @@ class KitIliasWebCrawlerSection(CrawlerSection): return self.s.getint("link_file_redirect_delay", fallback=-1) def link_file_use_plaintext(self) -> bool: - return self.s.getboolean("link_file_plain_text", fallback=False) + return self.s.getboolean("link_file_plaintext", fallback=False) - def no_videos(self) -> bool: - return self.s.getboolean("no-videos", fallback=True) + def videos(self) -> bool: + return self.s.getboolean("videos", fallback=False) _DIRECTORY_PAGES: Set[IliasElementType] = set([ @@ -163,7 +163,7 @@ class KitIliasWebCrawler(HttpCrawler): self._target = section.target() self._link_file_redirect_delay = section.link_file_redirect_delay() self._link_file_use_plaintext = section.link_file_use_plaintext() - self._no_videos = section.no_videos() + self._videos = section.videos() async def _run(self) -> None: if isinstance(self._target, int): @@ -253,7 +253,7 
@@ class KitIliasWebCrawler(HttpCrawler): if element.type in _VIDEO_ELEMENTS: log.explain_topic(f"Decision: Crawl video element {fmt_path(element_path)}") - if self._no_videos: + if not self._videos: log.explain("Video crawling is disabled") log.explain("Answer: no") return From d8f26a789ed79b680f0a76d516cd9babdcc22c0c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 13:26:40 +0200 Subject: [PATCH 198/524] Implement CLI Command for ilias crawler --- PFERD/cli/__init__.py | 1 + PFERD/cli/command_kit_ilias_web.py | 83 ++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 PFERD/cli/command_kit_ilias_web.py diff --git a/PFERD/cli/__init__.py b/PFERD/cli/__init__.py index 2a9f124..f9cb5d2 100644 --- a/PFERD/cli/__init__.py +++ b/PFERD/cli/__init__.py @@ -7,4 +7,5 @@ # well. from . import command_local # noqa: F401 imported but unused +from . import command_kit_ilias_web # noqa: F401 imported but unused from .parser import PARSER, load_default_section # noqa: F401 imported but unused diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py new file mode 100644 index 0000000..c743a51 --- /dev/null +++ b/PFERD/cli/command_kit_ilias_web.py @@ -0,0 +1,83 @@ +import argparse +import configparser +from pathlib import Path + +from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler + +SUBPARSER = SUBPARSERS.add_parser( + "kit-ilias-web", + parents=[CRAWLER_PARSER], +) + +GROUP = SUBPARSER.add_argument_group( + title="KIT ILIAS web-crawler arguments", + description="arguments for the 'kit-ilias-web' crawler", +) +GROUP.add_argument( + "target", + type=str, + metavar="TARGET", + help="course id, 'desktop', or ILIAS https-URL to crawl" +) +GROUP.add_argument( + "output", + type=Path, + metavar="OUTPUT", + help="output directory" +) +GROUP.add_argument( + "--videos", + # TODO Use argparse.BooleanOptionalAction after updating to 3.9 + action="store_const", + const=True, + help="crawl and download 
videos" +) +GROUP.add_argument( + "--username", + type=str, + metavar="USER_NAME", + help="user name for authentication" +) +GROUP.add_argument( + "--link-file-redirect-delay", + type=int, + metavar="SECONDS", + help="delay before external link files redirect you to their target (-1 to disable)" +) +GROUP.add_argument( + "--link-file-plaintext", + # TODO Use argparse.BooleanOptionalAction after updating to 3.9 + action="store_const", + const=True, + help="use plain text files for external links" +) + + +def load( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + parser["crawl:kit-ilias-web"] = {} + section = parser["crawl:kit-ilias-web"] + load_crawler(args, section) + + section["type"] = "kit-ilias-web" + section["target"] = str(args.target) + section["output_dir"] = str(args.output) + section["auth"] = "auth:kit-ilias-web" + if args.link_file_redirect_delay is not None: + section["link_file_redirect_delay"] = str(args.link_file_redirect_delay) + if args.link_file_plaintext is not None: + section["link_file_plaintext"] = str(args.link_file_plaintext) + if args.videos is not None: + section["videos"] = str(False) + + parser["auth:kit-ilias-web"] = {} + auth_section = parser["auth:kit-ilias-web"] + auth_section["type"] = "simple" + + if args.username is not None: + auth_section["username"] = str(args.username) + + +SUBPARSER.set_defaults(command=load) From 245c9c3dcc11d03395432efc831f2a42d2c28214 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 16:22:14 +0200 Subject: [PATCH 199/524] Explain output dir decisions and steps --- PFERD/output_dir.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 1f83de6..c81b598 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -174,9 +174,12 @@ class OutputDirectory: # since we know that the remote is different from the local files. This # includes the case where no local file exists. 
if not local_path.is_file(): + log.explain("No corresponding file present locally") # TODO Don't download if on_conflict is LOCAL_FIRST or NO_DELETE return True + log.explain(f"Redownload policy is {redownload.value}") + if redownload == Redownload.NEVER: return False elif redownload == Redownload.ALWAYS: @@ -187,6 +190,10 @@ class OutputDirectory: remote_newer = None if mtime := heuristics.mtime: remote_newer = mtime.timestamp() > stat.st_mtime + if remote_newer: + log.explain("Remote file seems to be newer") + else: + log.explain("Local file seems to be newer") if redownload == Redownload.NEVER_SMART: if remote_newer is None: @@ -332,19 +339,25 @@ class OutputDirectory: # Detect and solve local-dir-remote-file conflict if local_path.is_dir(): + log.explain("Conflict: There's a dir in place of the local file") if await self._conflict_ldrf(on_conflict, path): + log.explain("Result: Delete the dir") shutil.rmtree(local_path) else: + log.explain("Result: Keep the dir") return None # Detect and solve local-file-remote-dir conflict for parent in path.parents: local_parent = self.resolve(parent) if local_parent.exists() and not local_parent.is_dir(): + log.explain("Conflict: One of the local file's parents is a file") if await self._conflict_lfrd(on_conflict, path, parent): + log.explain("Result: Delete the obstructing file") local_parent.unlink() break else: + log.explain("Result: Keep the obstructing file") return None # Ensure parent directory exists @@ -366,9 +379,12 @@ class OutputDirectory: async def _after_download(self, info: DownloadInfo) -> None: with self._ensure_deleted(info.tmp_path): + log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") + changed = False if not info.success: + log.explain("Download unsuccessful, aborting") return # Solve conflicts arising from existing local file @@ -376,13 +392,21 @@ class OutputDirectory: changed = True if filecmp.cmp(info.local_path, info.tmp_path): + log.explain("Contents identical with existing 
file") + log.explain("Updating metadata on existing file instead") self._update_metadata(info) return - if not await self._conflict_lfrf(info.on_conflict, info.path): + log.explain("Conflict: The local and remote versions differ") + if await self._conflict_lfrf(info.on_conflict, info.path): + log.explain("Result: Keeping the remote version") + else: + log.explain("Result: Keeping the local version") return + log.explain("Replacing local file with temporary file") info.tmp_path.replace(info.local_path) + log.explain("Updating metadata on local file") self._update_metadata(info) if changed: From b998339002ee93d69f6a1acb8372abcf9ab6c1e1 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 16:22:38 +0200 Subject: [PATCH 200/524] Fix cleanup logging of paths --- PFERD/output_dir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index c81b598..bf908f8 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -443,7 +443,7 @@ class OutputDirectory: if await self._conflict_delete_lf(self._on_conflict, pure): try: path.unlink() - log.action(f"[bold bright_magenta]Deleted[/] {escape(fmt_path(path))}") + log.action(f"[bold bright_magenta]Deleted[/] {escape(fmt_path(pure))}") self._report.delete_file(pure) except OSError: pass From c0cecf8363eb296dad5a04e2c2685d4f5a2080b2 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 16:22:58 +0200 Subject: [PATCH 201/524] Log crawl and download actions more extensively --- PFERD/crawler.py | 29 ++++++++++++++++------------- PFERD/output_dir.py | 8 +++++++- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index c1184c0..749510c 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -90,31 +90,38 @@ def anoncritical(f: AWrapped) -> AWrapped: class CrawlToken(ReusableAsyncContextManager[ProgressBar]): - def __init__(self, limiter: Limiter, desc: str): + def __init__(self, limiter: Limiter, path: PurePath): 
super().__init__() self._limiter = limiter - self._desc = desc + self._path = path async def _on_aenter(self) -> ProgressBar: + bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}" + after_desc = f"[bold bright_cyan]Crawled[/] {escape(fmt_path(self._path))}" + + self._stack.callback(lambda: log.action(after_desc)) await self._stack.enter_async_context(self._limiter.limit_crawl()) - bar = self._stack.enter_context(log.crawl_bar(self._desc)) + bar = self._stack.enter_context(log.crawl_bar(bar_desc)) return bar class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]): - def __init__(self, limiter: Limiter, fs_token: FileSinkToken, desc: str): + def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath): super().__init__() self._limiter = limiter self._fs_token = fs_token - self._desc = desc + self._path = path async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]: + bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}" + # The "Downloaded ..." 
message is printed in the output dir, not here + await self._stack.enter_async_context(self._limiter.limit_crawl()) sink = await self._stack.enter_async_context(self._fs_token) - bar = self._stack.enter_context(log.crawl_bar(self._desc)) + bar = self._stack.enter_context(log.crawl_bar(bar_desc)) return bar, sink @@ -229,9 +236,7 @@ class Crawler(ABC): return None log.explain("Answer: Yes") - - desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(path))}" - return CrawlToken(self._limiter, desc) + return CrawlToken(self._limiter, path) async def download( self, @@ -247,15 +252,13 @@ class Crawler(ABC): log.explain("Answer: No") return None - fs_token = await self._output_dir.download(transformed_path, mtime, redownload, on_conflict) + fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict) if fs_token is None: log.explain("Answer: No") return None log.explain("Answer: Yes") - - desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(path))}" - return DownloadToken(self._limiter, fs_token, desc) + return DownloadToken(self._limiter, fs_token, path) async def _cleanup(self) -> None: log.explain_topic("Decision: Clean up files") diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index bf908f8..b07fe3e 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -78,6 +78,7 @@ class FileSink: @dataclass class DownloadInfo: + remote_path: PurePath path: PurePath local_path: Path tmp_path: Path @@ -96,6 +97,7 @@ class FileSinkToken(ReusableAsyncContextManager[FileSink]): def __init__( self, output_dir: "OutputDirectory", + remote_path: PurePath, path: PurePath, local_path: Path, heuristics: Heuristics, @@ -104,6 +106,7 @@ class FileSinkToken(ReusableAsyncContextManager[FileSink]): super().__init__() self._output_dir = output_dir + self._remote_path = remote_path self._path = path self._local_path = local_path self._heuristics = heuristics @@ -115,6 +118,7 @@ class FileSinkToken(ReusableAsyncContextManager[FileSink]): 
async def after_download() -> None: await self._output_dir._after_download(DownloadInfo( + self._remote_path, self._path, self._local_path, tmp_path, @@ -317,6 +321,7 @@ class OutputDirectory: async def download( self, + remote_path: PurePath, path: PurePath, mtime: Optional[datetime] = None, redownload: Optional[Redownload] = None, @@ -363,7 +368,7 @@ class OutputDirectory: # Ensure parent directory exists local_path.parent.mkdir(parents=True, exist_ok=True) - return FileSinkToken(self, path, local_path, heuristics, on_conflict) + return FileSinkToken(self, remote_path, path, local_path, heuristics, on_conflict) def _update_metadata(self, info: DownloadInfo) -> None: if mtime := info.heuristics.mtime: @@ -379,6 +384,7 @@ class OutputDirectory: async def _after_download(self, info: DownloadInfo) -> None: with self._ensure_deleted(info.tmp_path): + log.action(f"[bold bright_cyan]Downloaded[/] {fmt_path(info.remote_path)}") log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") changed = False From 29d5a40c570ac21b7bd73fee64134e6c79216301 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 17:25:16 +0200 Subject: [PATCH 202/524] Replace asyncio.gather with custom Crawler function --- PFERD/crawler.py | 22 ++++++++++++++++++- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 5 ++--- PFERD/crawlers/local.py | 2 +- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index 749510c..e3aef8f 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -1,7 +1,8 @@ +import asyncio from abc import ABC, abstractmethod from datetime import datetime from pathlib import Path, PurePath -from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, TypeVar +from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar from rich.markup import escape @@ -228,6 +229,25 @@ class Crawler(ABC): section.on_conflict(), ) + @staticmethod + async def gather(awaitables: 
Sequence[Awaitable[Any]]) -> List[Any]: + """ + Similar to asyncio.gather. However, in the case of an exception, all + still running tasks are cancelled and the exception is rethrown. + + This should always be preferred over asyncio.gather in crawler code so + that an exception like CrawlError may actually stop the crawler. + """ + + tasks = [asyncio.ensure_future(aw) for aw in awaitables] + result = asyncio.gather(*tasks) + try: + return await result + except: # noqa: E722 + for task in tasks: + task.cancel() + raise + async def crawl(self, path: PurePath) -> Optional[CrawlToken]: log.explain_topic(f"Decision: Crawl {fmt_path(path)}") diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 9094a7b..597ea17 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -1,4 +1,3 @@ -import asyncio import re from pathlib import PurePath from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union @@ -215,7 +214,7 @@ class KitIliasWebCrawler(HttpCrawler): # this method without having spawned a single task. Due to this we do # not need to cancel anything or worry about this gather call or the forks # further up. - await asyncio.gather(*tasks) + await self.gather(tasks) await impl() @@ -240,7 +239,7 @@ class KitIliasWebCrawler(HttpCrawler): # this method without having spawned a single task. Due to this we do # not need to cancel anything or worry about this gather call or the forks # further up. 
- await asyncio.gather(*tasks) + await self.gather(tasks) await impl() diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py index 176f36d..35e5829 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawlers/local.py @@ -83,7 +83,7 @@ class LocalCrawler(Crawler): pure_child = pure / child.name tasks.append(self._crawl_path(child, pure_child)) - await asyncio.gather(*tasks) + await self.gather(tasks) async def _crawl_file(self, path: Path, pure: PurePath) -> None: stat = path.stat() From 05ad06fbc11237582782ca68e587f55b683f2493 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 17:24:05 +0200 Subject: [PATCH 203/524] Only enclose get_page in iorepeat in ILIAS crawler We previously also gathered in there, which could lead to some more surprises when the method was retried. --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 597ea17..1019d3e 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -1,6 +1,6 @@ import re from pathlib import PurePath -from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union +from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union import aiohttp from aiohttp import hdrs @@ -192,10 +192,11 @@ class KitIliasWebCrawler(HttpCrawler): return cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 - @_iorepeat(3, "crawling url") - async def impl() -> None: - tasks = [] + elements: List[IliasPageElement] = [] + @_iorepeat(3, "crawling url") + async def gather_elements() -> None: + elements.clear() async with cl: soup = await self._get_page(url) @@ -204,19 +205,16 @@ class KitIliasWebCrawler(HttpCrawler): if not perma_link_element or "crs_" not in perma_link_element.get("value"): raise 
CrawlError("Invalid course id? Didn't find anything looking like a course") - # Duplicated code, but the root page is special - we want to void fetching it twice! + # Duplicated code, but the root page is special - we want to avoid fetching it twice! page = IliasPage(soup, url, None) - for child in page.get_child_elements(): - tasks.append(self._handle_ilias_element(PurePath("."), child)) + elements.extend(page.get_child_elements()) - # The only point an I/O exception can be thrown is in `get_page`. - # If that happens, no task was spawned yet. Therefore, we only retry - # this method without having spawned a single task. Due to this we do - # not need to cancel anything or worry about this gather call or the forks - # further up. - await self.gather(tasks) + # Fill up our task list with the found elements + await gather_elements() + tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements] - await impl() + # And execute them + await self.gather(tasks) async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: maybe_cl = await self.crawl(path) @@ -224,28 +222,27 @@ class KitIliasWebCrawler(HttpCrawler): return cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + elements: List[IliasPageElement] = [] + @_iorepeat(3, "crawling folder") - async def impl() -> None: - tasks = [] + async def gather_elements() -> None: + elements.clear() async with cl: soup = await self._get_page(url) page = IliasPage(soup, url, parent) - for child in page.get_child_elements(): - tasks.append(self._handle_ilias_element(path, child)) + elements.extend(page.get_child_elements()) - # The only point an I/O exception can be thrown is in `get_page`. - # If that happens, no task was spawned yet. Therefore, we only retry - # this method without having spawned a single task. Due to this we do - # not need to cancel anything or worry about this gather call or the forks - # further up. 
- await self.gather(tasks) + # Fill up our task list with the found elements + await gather_elements() + tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements] - await impl() + # And execute them + await self.gather(tasks) @anoncritical - # Shouldn't happen but this method must never raise an I/O error as that might interfere with - # handle_ilias_page or crawl_url + # Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical. + # If that happens we will be terminated as anoncritical doesn't tream them as non-critical. @_wrap_io_in_warning("handling ilias element") async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: element_path = PurePath(parent_path, element.name) From 463f8830d736e7b510c282f125eef65533d4b804 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 18:12:34 +0200 Subject: [PATCH 204/524] Add warn_contd --- PFERD/logging.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PFERD/logging.py b/PFERD/logging.py index ef6ee4c..e97a3fa 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -130,6 +130,13 @@ class Log: self.print(f"[bold bright_red]Warning[/] {escape(text)}") + def warn_contd(self, text: str) -> None: + """ + Print further lines of a warning message. Allows no markup. + """ + + self.print(f"{escape(text)}") + def error(self, text: str) -> None: """ Print an error message. Allows no markup. 
From 59f13bb8d6f2616705fb5d0e3db7e80a46560b61 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 18:12:51 +0200 Subject: [PATCH 205/524] Explain ILIAS HTML parsing and add some warnings --- PFERD/crawlers/ilias/kit_ilias_html.py | 58 +++++++++++-------- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 2 + 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_html.py b/PFERD/crawlers/ilias/kit_ilias_html.py index eed0884..cc02801 100644 --- a/PFERD/crawlers/ilias/kit_ilias_html.py +++ b/PFERD/crawlers/ilias/kit_ilias_html.py @@ -8,6 +8,7 @@ from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag +from PFERD.logging import log from PFERD.utils import url_set_query_params TargetType = Union[str, int] @@ -48,11 +49,15 @@ class IliasPage: Return all child page elements you can find here. """ if self._is_video_player(): + log.explain("Page is a video player, extracting URL") return self._player_to_video() if self._is_video_listing(): + log.explain("Page is a video listing, finding elements") return self._find_video_entries() if self._is_exercise_file(): + log.explain("Page is an exercise, finding elements") return self._find_exercise_entries() + log.explain("Page is a normal folder, finding elements") return self._find_normal_entries() def _is_video_player(self) -> bool: @@ -96,7 +101,7 @@ class IliasPage: json_match = regex.search(str(self._soup)) if json_match is None: - print(f"Could not find json stream info for {self._page_url!r}") + log.warn("Could not find JSON stream info in video player. 
Ignoring video.") return [] json_str = json_match.group(1) @@ -125,6 +130,7 @@ class IliasPage: url: str = self._abs_url_from_link(content_link) query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) + log.explain("Found ILIAS redirection page, following it as a new entry") return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None @@ -139,20 +145,12 @@ class IliasPage: table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) if table_element is None: - # TODO: Properly log this - print( - "Could not increase elements per page (table not found)." - " Some might not be crawled!" - ) + log.warn("Couldn't increase elements per page (table not found). I might miss elements.") return self._find_video_entries_no_paging() id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) if id_match is None: - # TODO: Properly log this - print( - "Could not increase elements per page (table id not found)." - " Some might not be crawled!" - ) + log.warn("Couldn't increase elements per page (table id not found). 
I might miss elements.") return self._find_video_entries_no_paging() table_id = id_match.group(1) @@ -160,6 +158,8 @@ class IliasPage: query_params = {f"tbl_xoct_{table_id}_trows": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(self._page_url, query_params) + + log.explain("Disabled pagination, retrying folder as a new entry") return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] def _find_video_entries_no_paging(self) -> List[IliasPageElement]: @@ -173,7 +173,6 @@ class IliasPage: results: List[IliasPageElement] = [] - # TODO: Sadly the download button is currently broken, so never do that for link in video_links: results.append(self._listed_video_to_element(link)) @@ -194,6 +193,7 @@ class IliasPage: video_url = self._abs_url_from_link(link) + log.explain(f"Found video {video_name!r} at {video_url!r}") return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) def _find_exercise_entries(self) -> List[IliasPageElement]: @@ -213,6 +213,8 @@ class IliasPage: text="Download" ) + log.explain(f"Found exercise container {container_name!r}") + # Grab each file as you now have the link for file_link in files: # Two divs, side by side. 
Left is the name, right is the link ==> get left @@ -221,6 +223,7 @@ class IliasPage: file_name = _sanitize_path_name(file_name) url = self._abs_url_from_link(file_link) + log.explain(f"Found exercise entry {file_name!r}") results.append(IliasPageElement( IliasElementType.FILE, url, @@ -245,11 +248,14 @@ class IliasPage: if not element_type: continue if element_type == IliasElementType.MEETING: - element_name = _sanitize_path_name(self._normalize_meeting_name(element_name)) + normalized = _sanitize_path_name(self._normalize_meeting_name(element_name)) + log.explain(f"Normalized meeting name from {element_name!r} to {normalized!r}") + element_name = normalized elif element_type == IliasElementType.FILE: result.append(self._file_to_element(element_name, abs_url, link)) continue + log.explain(f"Found {element_name!r}") result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) return result @@ -282,8 +288,8 @@ class IliasPage: ) if modification_date_match is None: modification_date = None - # TODO: Properly log this - print(f"Could not extract start date from {all_properties_text!r}") + # TODO: Figure out if this is expected or *always* an error. + log.explain(f"Element {name} at {url} has no date. Properties: {all_properties_text!r}") else: modification_date_str = modification_date_match.group(1) modification_date = demangle_date(modification_date_str) @@ -292,6 +298,7 @@ class IliasPage: name = _sanitize_path_name(link_element.getText()) full_path = name + "." 
+ file_type + log.explain(f"Found file {full_path!r}") return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) @staticmethod @@ -318,8 +325,10 @@ class IliasPage: if "ref_id=" in parsed_url.query: return IliasPage._find_type_from_folder_like(link_element, url) - # TODO: Log this properly - print(f"Unknown type: The element was at {str(element_name)!r} and it is {link_element!r})") + _unexpected_html_warning() + log.warn_contd( + f"Tried to figure out element type, but failed for {str(element_name)!r} / {link_element!r})" + ) return None @staticmethod @@ -339,16 +348,16 @@ class IliasPage: break if found_parent is None: - # TODO: Log this properly - print(f"Could not find element icon for {url!r}") + _unexpected_html_warning() + log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url!r}") return None # Find the small descriptive icon to figure out the type img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") if img_tag is None: - # TODO: Log this properly - print(f"Could not find image tag for {url!r}") + _unexpected_html_warning() + log.warn_contd(f"Tried to figure out element type, but did not find an image for {url!r}") return None if "opencast" in str(img_tag["alt"]).lower(): @@ -393,6 +402,10 @@ class IliasPage: return urljoin(self._page_url, link_tag.get("href")) +def _unexpected_html_warning() -> None: + log.warn("Encountered unexpected HTML structure, ignoring element.") + + german_months = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Dez'] english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] @@ -429,8 +442,7 @@ def demangle_date(date_str: str) -> Optional[datetime]: return datetime(year, month, day, hour, minute) except Exception: - # TODO: Properly log this - print(f"Could not parse date {date_str!r}") + log.warn(f"Date parsing failed for {date_str!r}") return None diff --git 
a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 1019d3e..1a4a529 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -229,6 +229,8 @@ class KitIliasWebCrawler(HttpCrawler): elements.clear() async with cl: soup = await self._get_page(url) + log.explain_topic(f"Parsing HTML page for {path!r}") + log.explain(f"URL: {url!r}") page = IliasPage(soup, url, parent) elements.extend(page.get_child_elements()) From a9af56a5e9b5752cf7ba1180ac36c5cdf6605316 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 18:16:25 +0200 Subject: [PATCH 206/524] Improve specifying crawlers via CLI Instead of removing the sections of unselected crawlers from the config file, crawler selection now happens in the Pferd after loading the crawlers and is more sophisticated. It also has better error messages. --- PFERD/__main__.py | 24 +++-------------- PFERD/pferd.py | 68 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 28 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 2578487..9bc2974 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -6,7 +6,7 @@ from pathlib import Path from .cli import PARSER, load_default_section from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError from .logging import log -from .pferd import Pferd +from .pferd import Pferd, PferdLoadError from .transformer import RuleParseError from .version import NAME, VERSION @@ -24,28 +24,10 @@ def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser: args.command(args, parser) load_default_section(args, parser) - prune_crawlers(args, parser) return parser -def prune_crawlers( - args: argparse.Namespace, - parser: configparser.ConfigParser, -) -> None: - if not args.crawler: - return - - for section in parser.sections(): - if section.startswith("crawl:"): - # TODO Use removeprefix() when switching to 3.9 
- name = section[len("crawl:"):] - if name not in args.crawler: - parser.remove_section(section) - - # TODO Check if crawlers actually exist - - def load_config(args: argparse.Namespace) -> Config: try: return Config(load_config_parser(args)) @@ -119,9 +101,9 @@ def main() -> None: exit() try: - pferd = Pferd(config) + pferd = Pferd(config, args.crawler) asyncio.run(pferd.run()) - except ConfigOptionError as e: + except (PferdLoadError, ConfigOptionError) as e: log.unlock() log.error(str(e)) exit(1) diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 4aee043..75b0e9d 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, List, Optional from rich.markup import escape @@ -10,13 +10,22 @@ from .crawlers import CRAWLERS from .logging import log +class PferdLoadError(Exception): + pass + + class Pferd: - def __init__(self, config: Config): + def __init__(self, config: Config, crawlers_to_run: Optional[List[str]]): """ - May throw ConfigOptionError. + May throw PferdLoadError. 
""" + if crawlers_to_run is not None and len(crawlers_to_run) != len(set(crawlers_to_run)): + raise PferdLoadError("Some crawlers were selected multiple times") + self._config = config + self._crawlers_to_run = crawlers_to_run + self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} @@ -31,9 +40,13 @@ class Pferd: authenticator = authenticator_constructor(name, section, self._config) self._authenticators[name] = authenticator - def _load_crawlers(self) -> None: + def _load_crawlers(self) -> List[str]: + names = [] + for name, section in self._config.crawler_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") + names.append(name) + crawl_type = section.get("type") crawler_constructor = CRAWLERS.get(crawl_type) if crawler_constructor is None: @@ -42,15 +55,56 @@ class Pferd: crawler = crawler_constructor(name, section, self._config, self._authenticators) self._crawlers[name] = crawler + return names + + def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]: + log.explain_topic("Deciding which crawlers to run") + + if self._crawlers_to_run is None: + log.explain("No crawlers specified on CLI") + log.explain("Running all loaded crawlers") + return loaded_crawlers + + log.explain("Crawlers specified on CLI") + + names: List[str] = [] # With 'crawl:' prefix + unknown_names = [] # Without 'crawl:' prefix + + for name in self._crawlers_to_run: + section_name = f"crawl:{name}" + if section_name in self._crawlers: + log.explain(f"Found crawler section named {section_name!r}") + names.append(section_name) + else: + log.explain(f"There's no crawler section named {section_name!r}") + unknown_names.append(name) + + if unknown_names: + if len(unknown_names) == 1: + [name] = unknown_names + raise PferdLoadError(f"There is no crawler named {name!r}") + else: + names_str = ", ".join(repr(name) for name in unknown_names) + raise PferdLoadError(f"There are no crawlers named {names_str}") + + return names + async 
def run(self) -> None: + """ + May throw PferdLoadError or ConfigOptionError. + """ + # These two functions must run inside the same event loop as the # crawlers, so that any new objects (like Conditions or Futures) can # obtain the correct event loop. self._load_authenticators() - self._load_crawlers() + loaded_crawlers = self._load_crawlers() + + log.print("") + + for name in self._find_crawlers_to_run(loaded_crawlers): + crawler = self._crawlers[name] - for name, crawler in self._crawlers.items(): - log.print("") log.print(f"[bold bright_cyan]Running[/] {escape(name)}") try: From 79efdb56f7b5e1e3638bda4896cdb6d9df73a690 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 18:22:29 +0200 Subject: [PATCH 207/524] Adjust ILIAS html explain messages --- PFERD/crawlers/ilias/kit_ilias_html.py | 6 +++--- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_html.py b/PFERD/crawlers/ilias/kit_ilias_html.py index cc02801..636fa68 100644 --- a/PFERD/crawlers/ilias/kit_ilias_html.py +++ b/PFERD/crawlers/ilias/kit_ilias_html.py @@ -52,12 +52,12 @@ class IliasPage: log.explain("Page is a video player, extracting URL") return self._player_to_video() if self._is_video_listing(): - log.explain("Page is a video listing, finding elements") + log.explain("Page is a video listing, searching for elements") return self._find_video_entries() if self._is_exercise_file(): - log.explain("Page is an exercise, finding elements") + log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() - log.explain("Page is a normal folder, finding elements") + log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() def _is_video_player(self) -> bool: diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 1a4a529..41633d2 100644 --- 
a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -206,6 +206,8 @@ class KitIliasWebCrawler(HttpCrawler): raise CrawlError("Invalid course id? Didn't find anything looking like a course") # Duplicated code, but the root page is special - we want to avoid fetching it twice! + log.explain_topic("Parsing root HTML page") + log.explain(f"URL: {url}") page = IliasPage(soup, url, None) elements.extend(page.get_child_elements()) @@ -229,8 +231,8 @@ class KitIliasWebCrawler(HttpCrawler): elements.clear() async with cl: soup = await self._get_page(url) - log.explain_topic(f"Parsing HTML page for {path!r}") - log.explain(f"URL: {url!r}") + log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") + log.explain(f"URL: {url}") page = IliasPage(soup, url, parent) elements.extend(page.get_child_elements()) From d97d6bf147903b245b61f17254674ac6b53d7061 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 18:29:28 +0200 Subject: [PATCH 208/524] Fix handling nested ILIAS folders --- PFERD/crawlers/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py index 41633d2..7e1562c 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawlers/ilias/kit_ilias_web_crawler.py @@ -239,7 +239,7 @@ class KitIliasWebCrawler(HttpCrawler): # Fill up our task list with the found elements await gather_elements() - tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements] + tasks = [self._handle_ilias_element(path, element) for element in elements] # And execute them await self.gather(tasks) From 445dffc987e7dd62913bf8e941c01464a07f7577 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 18:35:32 +0200 Subject: [PATCH 209/524] Reword some explanations --- PFERD/output_dir.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) 
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index b07fe3e..b850a03 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -197,7 +197,7 @@ class OutputDirectory: if remote_newer: log.explain("Remote file seems to be newer") else: - log.explain("Local file seems to be newer") + log.explain("Remote file doesn't seem to be newer") if redownload == Redownload.NEVER_SMART: if remote_newer is None: @@ -344,12 +344,12 @@ class OutputDirectory: # Detect and solve local-dir-remote-file conflict if local_path.is_dir(): - log.explain("Conflict: There's a dir in place of the local file") + log.explain("Conflict: There's a directory in place of the local file") if await self._conflict_ldrf(on_conflict, path): - log.explain("Result: Delete the dir") + log.explain("Result: Delete the obstructing directory") shutil.rmtree(local_path) else: - log.explain("Result: Keep the dir") + log.explain("Result: Keep the obstructing directory") return None # Detect and solve local-file-remote-dir conflict @@ -399,20 +399,19 @@ class OutputDirectory: if filecmp.cmp(info.local_path, info.tmp_path): log.explain("Contents identical with existing file") - log.explain("Updating metadata on existing file instead") + log.explain("Updating metadata of existing file") self._update_metadata(info) return log.explain("Conflict: The local and remote versions differ") if await self._conflict_lfrf(info.on_conflict, info.path): - log.explain("Result: Keeping the remote version") + log.explain("Result: Replacing local with remote version") else: - log.explain("Result: Keeping the local version") + log.explain("Result: Keeping local version") return - log.explain("Replacing local file with temporary file") info.tmp_path.replace(info.local_path) - log.explain("Updating metadata on local file") + log.explain("Updating file metadata") self._update_metadata(info) if changed: From 74c7b39dc85b4d2693503cd33d2021aea86b0137 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 18:39:25 +0200 
Subject: [PATCH 210/524] Clean up files in alphabetical order --- PFERD/output_dir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index b850a03..8d1c6b1 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -431,7 +431,7 @@ class OutputDirectory: await self._cleanup_file(path, pure) async def _cleanup_dir(self, path: Path, pure: PurePath, delete_self: bool = True) -> None: - for child in path.iterdir(): + for child in sorted(path.iterdir()): pure_child = pure / child.name await self._cleanup(child, pure_child) From e4e5e83be62c92ebbade8c8b7ea077c171f21e7f Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 18:39:43 +0200 Subject: [PATCH 211/524] Fix downloader using crawl bar Looks like I made a dumb copy-paste error. Now the download bar shows the proper progress and speed again. --- PFERD/crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/crawler.py b/PFERD/crawler.py index e3aef8f..e73ce72 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawler.py @@ -120,9 +120,9 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]): bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}" # The "Downloaded ..." 
message is printed in the output dir, not here - await self._stack.enter_async_context(self._limiter.limit_crawl()) + await self._stack.enter_async_context(self._limiter.limit_download()) sink = await self._stack.enter_async_context(self._fs_token) - bar = self._stack.enter_context(log.crawl_bar(bar_desc)) + bar = self._stack.enter_context(log.download_bar(bar_desc)) return bar, sink From 5edd868d5bbd44b7c8acc3e31db1adfc5a91dae8 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 18:49:34 +0200 Subject: [PATCH 212/524] Fix always-smart redownloading the wrong files --- PFERD/output_dir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 8d1c6b1..06cfe6b 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -208,7 +208,7 @@ class OutputDirectory: if remote_newer is None: return True else: - return not remote_newer + return remote_newer # This should never be reached raise ValueError(f"{redownload!r} is not a valid redownload policy") From 37f8d84a9c1fcadeee684f08ae8d3036f5a82213 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 19:00:01 +0200 Subject: [PATCH 213/524] Output total amount of http requests in HTTP Crawler --- PFERD/http_crawler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PFERD/http_crawler.py b/PFERD/http_crawler.py index adbac5d..c6e679d 100644 --- a/PFERD/http_crawler.py +++ b/PFERD/http_crawler.py @@ -27,6 +27,7 @@ class HttpCrawler(Crawler): self._authentication_id = 0 self._authentication_lock = asyncio.Lock() self._current_cookie_jar: Optional[aiohttp.CookieJar] = None + self._request_count = 0 async def _current_auth_id(self) -> int: """ @@ -41,6 +42,7 @@ class HttpCrawler(Crawler): # This should reduce the amount of requests we make: If an authentication is in progress # all future requests wait for authentication to complete. 
async with self._authentication_lock: + self._request_count += 1 return self._authentication_id async def authenticate(self, caller_auth_id: int) -> None: @@ -85,6 +87,7 @@ class HttpCrawler(Crawler): async def run(self) -> None: self._current_cookie_jar = aiohttp.CookieJar() + self._request_count = 0 try: self._current_cookie_jar.load(self._cookie_jar_path) @@ -100,6 +103,7 @@ class HttpCrawler(Crawler): await super().run() finally: del self.session + log.explain_topic(f"Total amount of HTTP requests: {self._request_count}") # They are saved in authenticate, but a final save won't hurt await self._save_cookies() From bbf9f8f130f8087604548e1716fc94bef22dc5a8 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 19:05:56 +0200 Subject: [PATCH 214/524] Add -C as alias for --crawler --- PFERD/cli/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index 71d9732..bd62b6e 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -88,7 +88,7 @@ PARSER.add_argument( " Uses default config file path if no path is specified" ) PARSER.add_argument( - "--crawler", + "--crawler", "-C", action="append", type=str, metavar="NAME", From 2fdf24495b1655feb7a2e6a2ef349d19e3442ef3 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 19:16:42 +0200 Subject: [PATCH 215/524] Restructure crawling and auth related modules --- PFERD/{authenticators => auth}/__init__.py | 2 +- PFERD/{ => auth}/authenticator.py | 2 +- PFERD/{authenticators => auth}/simple.py | 2 +- PFERD/{authenticators => auth}/tfa.py | 2 +- PFERD/{crawlers => crawl}/__init__.py | 6 +++--- PFERD/{ => crawl}/crawler.py | 16 ++++++++-------- PFERD/{ => crawl}/http_crawler.py | 8 ++++---- PFERD/{crawlers => crawl}/ilias/__init__.py | 0 .../{crawlers => crawl}/ilias/file_templates.py | 0 .../{crawlers => crawl}/ilias/kit_ilias_html.py | 0 .../ilias/kit_ilias_web_crawler.py | 16 +++++++--------- .../local.py => crawl/local_crawler.py} | 2 +- 
PFERD/pferd.py | 6 ++---- 13 files changed, 29 insertions(+), 33 deletions(-) rename PFERD/{authenticators => auth}/__init__.py (91%) rename PFERD/{ => auth}/authenticator.py (98%) rename PFERD/{authenticators => auth}/simple.py (96%) rename PFERD/{authenticators => auth}/tfa.py (93%) rename PFERD/{crawlers => crawl}/__init__.py (82%) rename PFERD/{ => crawl}/crawler.py (96%) rename PFERD/{ => crawl}/http_crawler.py (97%) rename PFERD/{crawlers => crawl}/ilias/__init__.py (100%) rename PFERD/{crawlers => crawl}/ilias/file_templates.py (100%) rename PFERD/{crawlers => crawl}/ilias/kit_ilias_html.py (100%) rename PFERD/{crawlers => crawl}/ilias/kit_ilias_web_crawler.py (98%) rename PFERD/{crawlers/local.py => crawl/local_crawler.py} (98%) diff --git a/PFERD/authenticators/__init__.py b/PFERD/auth/__init__.py similarity index 91% rename from PFERD/authenticators/__init__.py rename to PFERD/auth/__init__.py index 35096cf..6247e2b 100644 --- a/PFERD/authenticators/__init__.py +++ b/PFERD/auth/__init__.py @@ -1,8 +1,8 @@ from configparser import SectionProxy from typing import Callable, Dict -from ..authenticator import Authenticator, AuthSection from ..config import Config +from .authenticator import Authenticator, AuthSection from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator diff --git a/PFERD/authenticator.py b/PFERD/auth/authenticator.py similarity index 98% rename from PFERD/authenticator.py rename to PFERD/auth/authenticator.py index d67b263..9217dcd 100644 --- a/PFERD/authenticator.py +++ b/PFERD/auth/authenticator.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from typing import Tuple -from .config import Config, Section +from ..config import Config, Section class AuthLoadException(Exception): diff --git a/PFERD/authenticators/simple.py b/PFERD/auth/simple.py similarity index 96% rename from PFERD/authenticators/simple.py rename to PFERD/auth/simple.py index bcbe69c..a12c359 100644 --- 
a/PFERD/authenticators/simple.py +++ b/PFERD/auth/simple.py @@ -1,9 +1,9 @@ from typing import Optional, Tuple -from ..authenticator import Authenticator, AuthException, AuthSection from ..config import Config from ..logging import log from ..utils import agetpass, ainput +from .authenticator import Authenticator, AuthException, AuthSection class SimpleAuthSection(AuthSection): diff --git a/PFERD/authenticators/tfa.py b/PFERD/auth/tfa.py similarity index 93% rename from PFERD/authenticators/tfa.py rename to PFERD/auth/tfa.py index b0eef18..670626d 100644 --- a/PFERD/authenticators/tfa.py +++ b/PFERD/auth/tfa.py @@ -1,9 +1,9 @@ from typing import Tuple -from ..authenticator import Authenticator, AuthException, AuthSection from ..config import Config from ..logging import log from ..utils import ainput +from .authenticator import Authenticator, AuthException, AuthSection class TfaAuthenticator(Authenticator): diff --git a/PFERD/crawlers/__init__.py b/PFERD/crawl/__init__.py similarity index 82% rename from PFERD/crawlers/__init__.py rename to PFERD/crawl/__init__.py index dc7dfa0..297c490 100644 --- a/PFERD/crawlers/__init__.py +++ b/PFERD/crawl/__init__.py @@ -1,11 +1,11 @@ from configparser import SectionProxy from typing import Callable, Dict -from ..authenticator import Authenticator +from ..auth import Authenticator from ..config import Config -from ..crawler import Crawler +from .crawler import Crawler, CrawlError # noqa: F401 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection -from .local import LocalCrawler, LocalCrawlerSection +from .local_crawler import LocalCrawler, LocalCrawlerSection CrawlerConstructor = Callable[[ str, # Name (without the "crawl:" prefix) diff --git a/PFERD/crawler.py b/PFERD/crawl/crawler.py similarity index 96% rename from PFERD/crawler.py rename to PFERD/crawl/crawler.py index e73ce72..a79e968 100644 --- a/PFERD/crawler.py +++ b/PFERD/crawl/crawler.py @@ -6,14 +6,14 @@ from typing import Any, Awaitable, Callable, Dict, 
List, Optional, Sequence, Tup from rich.markup import escape -from .authenticator import Authenticator -from .config import Config, Section -from .limiter import Limiter -from .logging import ProgressBar, log -from .output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload -from .report import MarkConflictError, MarkDuplicateError -from .transformer import Transformer -from .utils import ReusableAsyncContextManager, fmt_path +from ..auth import Authenticator +from ..config import Config, Section +from ..limiter import Limiter +from ..logging import ProgressBar, log +from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload +from ..report import MarkConflictError, MarkDuplicateError +from ..transformer import Transformer +from ..utils import ReusableAsyncContextManager, fmt_path class CrawlWarning(Exception): diff --git a/PFERD/http_crawler.py b/PFERD/crawl/http_crawler.py similarity index 97% rename from PFERD/http_crawler.py rename to PFERD/crawl/http_crawler.py index c6e679d..e82dfed 100644 --- a/PFERD/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -4,11 +4,11 @@ from typing import Optional import aiohttp -from .config import Config +from ..config import Config +from ..logging import log +from ..utils import fmt_real_path +from ..version import NAME, VERSION from .crawler import Crawler, CrawlerSection -from .logging import log -from .utils import fmt_real_path -from .version import NAME, VERSION class HttpCrawler(Crawler): diff --git a/PFERD/crawlers/ilias/__init__.py b/PFERD/crawl/ilias/__init__.py similarity index 100% rename from PFERD/crawlers/ilias/__init__.py rename to PFERD/crawl/ilias/__init__.py diff --git a/PFERD/crawlers/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py similarity index 100% rename from PFERD/crawlers/ilias/file_templates.py rename to PFERD/crawl/ilias/file_templates.py diff --git a/PFERD/crawlers/ilias/kit_ilias_html.py 
b/PFERD/crawl/ilias/kit_ilias_html.py similarity index 100% rename from PFERD/crawlers/ilias/kit_ilias_html.py rename to PFERD/crawl/ilias/kit_ilias_html.py diff --git a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py similarity index 98% rename from PFERD/crawlers/ilias/kit_ilias_web_crawler.py rename to PFERD/crawl/ilias/kit_ilias_web_crawler.py index 7e1562c..33356ed 100644 --- a/PFERD/crawlers/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -6,15 +6,13 @@ import aiohttp from aiohttp import hdrs from bs4 import BeautifulSoup, Tag -from PFERD.authenticators import Authenticator -from PFERD.config import Config -from PFERD.crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical -from PFERD.http_crawler import HttpCrawler -from PFERD.logging import ProgressBar, log -from PFERD.output_dir import FileSink, Redownload -from PFERD.utils import soupify, url_set_query_param - -from ...utils import fmt_path +from ...auth import Authenticator +from ...config import Config +from ...logging import ProgressBar, log +from ...output_dir import FileSink, Redownload +from ...utils import fmt_path, soupify, url_set_query_param +from ..crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical +from ..http_crawler import HttpCrawler from .file_templates import link_template_plain, link_template_rich from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement diff --git a/PFERD/crawlers/local.py b/PFERD/crawl/local_crawler.py similarity index 98% rename from PFERD/crawlers/local.py rename to PFERD/crawl/local_crawler.py index 35e5829..7958169 100644 --- a/PFERD/crawlers/local.py +++ b/PFERD/crawl/local_crawler.py @@ -5,7 +5,7 @@ from pathlib import Path, PurePath from typing import Optional from ..config import Config -from ..crawler import Crawler, CrawlerSection, anoncritical +from .crawler import Crawler, CrawlerSection, anoncritical class 
LocalCrawlerSection(CrawlerSection): diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 75b0e9d..5b5b866 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -2,11 +2,9 @@ from typing import Dict, List, Optional from rich.markup import escape -from .authenticator import Authenticator -from .authenticators import AUTHENTICATORS +from .auth import AUTHENTICATORS, Authenticator from .config import Config, ConfigOptionError -from .crawler import Crawler, CrawlError -from .crawlers import CRAWLERS +from .crawl import CRAWLERS, Crawler, CrawlError from .logging import log From 6e9f8fd39107ce2ca0a11b5dd9f08b9d7dfa7cf2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 19:44:12 +0200 Subject: [PATCH 216/524] Add a keyring authenticator --- CONFIG.md | 9 +++++ PFERD/auth/__init__.py | 3 ++ PFERD/auth/keyring_authenticator.py | 56 +++++++++++++++++++++++++++++ setup.cfg | 1 + 4 files changed, 69 insertions(+) create mode 100644 PFERD/auth/keyring_authenticator.py diff --git a/CONFIG.md b/CONFIG.md index e92858f..bd3baca 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -154,6 +154,15 @@ This authenticator prompts the user on the console for a two-factor authentication token. The token is provided as password and it is not cached. This authenticator does not support usernames. +### The `keyring` authenticator + +This authenticator uses the system keyring to store passwords. It expects a +username in the config and will prompt *once* for the password. After that it +receives the password from the system keyring. + +- `username`: The username. (Required) +- `keyring_name`: The service name PFERD uses for storing credentials. (Optional) + ## Transformation rules Transformation rules are rules for renaming and excluding files and directories. 
diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 6247e2b..81ec31d 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -3,6 +3,7 @@ from typing import Callable, Dict from ..config import Config from .authenticator import Authenticator, AuthSection +from .keyring_authenticator import KeyringAuthenticator, KeyringAuthSection from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator @@ -17,4 +18,6 @@ AUTHENTICATORS: Dict[str, AuthConstructor] = { SimpleAuthenticator(n, SimpleAuthSection(s), c), "tfa": lambda n, s, c: TfaAuthenticator(n, AuthSection(s), c), + "keyring": lambda n, s, c: + KeyringAuthenticator(n, KeyringAuthSection(s), c) } diff --git a/PFERD/auth/keyring_authenticator.py b/PFERD/auth/keyring_authenticator.py new file mode 100644 index 0000000..413c7ad --- /dev/null +++ b/PFERD/auth/keyring_authenticator.py @@ -0,0 +1,56 @@ +from typing import Optional, Tuple + +import keyring + +from ..config import Config +from ..logging import log +from ..utils import agetpass +from ..version import NAME +from .authenticator import Authenticator, AuthException, AuthSection + + +class KeyringAuthSection(AuthSection): + def username(self) -> str: + name = self.s.get("username") + if name is None: + self.missing_value("username") + return name + + def keyring_name(self) -> str: + return self.s.get("keyring_name", fallback=NAME) + + +class KeyringAuthenticator(Authenticator): + + def __init__( + self, + name: str, + section: KeyringAuthSection, + config: Config, + ) -> None: + super().__init__(name, section, config) + + self._username = section.username() + self._password: Optional[str] = None + self._keyring_name = section.keyring_name() + + async def credentials(self) -> Tuple[str, str]: + if self._password is not None: + return self._username, self._password + + password = keyring.get_password(self._keyring_name, self._username) + + if not password: + async with log.exclusive_output(): + password = 
await agetpass("Password: ") + keyring.set_password(self._keyring_name, self._username, password) + + self._password = password + + return self._username, password + + def invalidate_credentials(self) -> None: + self.invalidate_password() + + def invalidate_password(self) -> None: + raise AuthException("Invalid password") diff --git a/setup.cfg b/setup.cfg index 431c3b9..1cbfc6a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,6 +9,7 @@ install_requires = aiohttp>=3.7.4.post0 beautifulsoup4>=4.9.3 rich>=10.1.0 + keyring>=23.0.1 [options.entry_points] console_scripts = From 6ca0ecdf05b85c1986de50724443aaabb5755506 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 20:46:12 +0200 Subject: [PATCH 217/524] Load and store reports --- PFERD/crawl/crawler.py | 12 +++++++- PFERD/output_dir.py | 35 ++++++++++++++++++++- PFERD/report.py | 69 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 110 insertions(+), 6 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index a79e968..60ea708 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -11,7 +11,7 @@ from ..config import Config, Section from ..limiter import Limiter from ..logging import ProgressBar, log from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload -from ..report import MarkConflictError, MarkDuplicateError +from ..report import MarkConflictError, MarkDuplicateError, Report from ..transformer import Transformer from ..utils import ReusableAsyncContextManager, fmt_path @@ -229,6 +229,14 @@ class Crawler(ABC): section.on_conflict(), ) + @property + def report(self) -> Report: + return self._output_dir.report + + @property + def prev_report(self) -> Optional[Report]: + return self._output_dir.prev_report + @staticmethod async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]: """ @@ -298,8 +306,10 @@ class Crawler(ABC): with log.show_progress(): self._output_dir.prepare() + 
self._output_dir.load_prev_report() await self._run() await self._cleanup() + self._output_dir.store_report() @abstractmethod async def _run(self) -> None: diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 06cfe6b..d60a312 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -1,4 +1,5 @@ import filecmp +import json import os import random import shutil @@ -13,7 +14,7 @@ from typing import BinaryIO, Iterator, Optional, Tuple from rich.markup import escape from .logging import log -from .report import Report +from .report import Report, ReportLoadError from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no SUFFIX_CHARS = string.ascii_lowercase + string.digits @@ -134,6 +135,8 @@ class FileSinkToken(ReusableAsyncContextManager[FileSink]): class OutputDirectory: + REPORT_FILE = PurePath(".report") + def __init__( self, root: Path, @@ -144,7 +147,19 @@ class OutputDirectory: self._redownload = redownload self._on_conflict = on_conflict + self._report_path = self.resolve(self.REPORT_FILE) self._report = Report() + self._prev_report: Optional[Report] = None + + self.register_reserved(self.REPORT_FILE) + + @property + def report(self) -> Report: + return self._report + + @property + def prev_report(self) -> Optional[Report]: + return self._prev_report def prepare(self) -> None: log.explain_topic(f"Creating base directory at {fmt_real_path(self._root)}") @@ -452,3 +467,21 @@ class OutputDirectory: self._report.delete_file(pure) except OSError: pass + + def load_prev_report(self) -> None: + log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}") + try: + self._prev_report = Report.load(self._report_path) + log.explain("Loaded report successfully") + except (OSError, json.JSONDecodeError, ReportLoadError) as e: + log.explain("Failed to load report") + log.explain(str(e)) + + def store_report(self) -> None: + log.explain_topic(f"Storing report to {fmt_real_path(self._report_path)}") + try: + 
self._report.store(self._report_path) + log.explain("Stored report successfully") + except OSError as e: + log.warn(f"Failed to save report to {fmt_real_path(self._report_path)}") + log.warn_contd(str(e)) diff --git a/PFERD/report.py b/PFERD/report.py index 7d8aa85..619c621 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -1,5 +1,10 @@ -from pathlib import PurePath -from typing import Set +import json +from pathlib import Path, PurePath +from typing import Any, Dict, List, Set + + +class ReportLoadError(Exception): + pass class MarkDuplicateError(Exception): @@ -48,10 +53,66 @@ class Report: self.reserved_files: Set[PurePath] = set() self.known_files: Set[PurePath] = set() - self.new_files: Set[PurePath] = set() + self.added_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set() + @staticmethod + def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]: + result: Any = data.get(key, []) + + if not isinstance(result, list): + raise ReportLoadError(f"Incorrect format: {key!r} is not a list") + + for elem in result: + if not isinstance(elem, str): + raise ReportLoadError(f"Incorrect format: {key!r} must contain only strings") + + return result + + @classmethod + def load(cls, path: Path) -> "Report": + """ + May raise OSError, JsonDecodeError, ReportLoadError. 
+ """ + + with open(path) as f: + data = json.load(f) + + if not isinstance(data, dict): + raise ReportLoadError("Incorrect format: Root is not an object") + + self = cls() + for elem in self._get_list_of_strs(data, "reserved"): + self.mark_reserved(PurePath(elem)) + for elem in self._get_list_of_strs(data, "known"): + self.mark(PurePath(elem)) + for elem in self._get_list_of_strs(data, "added"): + self.add_file(PurePath(elem)) + for elem in self._get_list_of_strs(data, "changed"): + self.change_file(PurePath(elem)) + for elem in self._get_list_of_strs(data, "deleted"): + self.delete_file(PurePath(elem)) + + return self + + def store(self, path: Path) -> None: + """ + May raise OSError. + """ + + data = { + "reserved": [str(path) for path in sorted(self.reserved_files)], + "known": [str(path) for path in sorted(self.known_files)], + "added": [str(path) for path in sorted(self.added_files)], + "changed": [str(path) for path in sorted(self.changed_files)], + "deleted": [str(path) for path in sorted(self.deleted_files)], + } + + with open(path, "w") as f: + json.dump(data, f, indent=2, sort_keys=True) + f.write("\n") # json.dump doesn't do this + def mark_reserved(self, path: PurePath) -> None: self.reserved_files.add(path) @@ -84,7 +145,7 @@ class Report: Unlike mark(), this function accepts any paths. 
""" - self.new_files.add(path) + self.added_files.add(path) def change_file(self, path: PurePath) -> None: """ From 9cce78669fc3fe418dde29b643804c7e38755af7 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 21:13:06 +0200 Subject: [PATCH 218/524] Print report after all crawlers have finished --- PFERD/pferd.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 5b5b866..2b9921e 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -6,6 +6,7 @@ from .auth import AUTHENTICATORS, Authenticator from .config import Config, ConfigOptionError from .crawl import CRAWLERS, Crawler, CrawlError from .logging import log +from .utils import fmt_path class PferdLoadError(Exception): @@ -97,10 +98,11 @@ class Pferd: # obtain the correct event loop. self._load_authenticators() loaded_crawlers = self._load_crawlers() + names = self._find_crawlers_to_run(loaded_crawlers) log.print("") - for name in self._find_crawlers_to_run(loaded_crawlers): + for name in names: crawler = self._crawlers[name] log.print(f"[bold bright_cyan]Running[/] {escape(name)}") @@ -111,3 +113,23 @@ class Pferd: log.error(str(e)) except Exception: log.unexpected_exception() + + for name in names: + crawler = self._crawlers[name] + + log.report("") + log.report(f"[bold bright_cyan]Report[/] for {escape(name)}") + + something_happened = False + for path in sorted(crawler.report.added_files): + something_happened = True + log.report(f" [bold bright_green]Added[/] {fmt_path(path)}") + for path in sorted(crawler.report.changed_files): + something_happened = True + log.report(f" [bold bright_yellow]Changed[/] {fmt_path(path)}") + for path in sorted(crawler.report.deleted_files): + something_happened = True + log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}") + + if not something_happened: + log.report(" Nothing happened") From ce1dbda5b4144884db48aa9041db32a162b56e1c Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 
May 2021 21:27:37 +0200 Subject: [PATCH 219/524] Overhaul colours "Crawled" and "Downloaded" are now printed less bright than "Crawling" and "Downloading" as they're not as important. Explain topics are printed in yellow to stand out a bit more from the cyan action messages. --- PFERD/crawl/crawler.py | 2 +- PFERD/logging.py | 2 +- PFERD/output_dir.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 60ea708..cefefad 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -99,7 +99,7 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]): async def _on_aenter(self) -> ProgressBar: bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}" - after_desc = f"[bold bright_cyan]Crawled[/] {escape(fmt_path(self._path))}" + after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}" self._stack.callback(lambda: log.action(after_desc)) await self._stack.enter_async_context(self._limiter.limit_crawl()) diff --git a/PFERD/logging.py b/PFERD/logging.py index e97a3fa..fc13e13 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -181,7 +181,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new """ if self.output_explain: - self.print(f"[cyan]{escape(text)}") + self.print(f"[yellow]{escape(text)}") def explain(self, text: str) -> None: """ diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index d60a312..cd68ead 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -399,7 +399,7 @@ class OutputDirectory: async def _after_download(self, info: DownloadInfo) -> None: with self._ensure_deleted(info.tmp_path): - log.action(f"[bold bright_cyan]Downloaded[/] {fmt_path(info.remote_path)}") + log.action(f"[bold cyan]Downloaded[/] {fmt_path(info.remote_path)}") log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") changed = False From 0ca06801654acaeb3c99440ce7073c9fff7395c1 Mon Sep 17 00:00:00 2001 From: Joscha 
Date: Sun, 23 May 2021 21:40:48 +0200 Subject: [PATCH 220/524] Simplify --version --- PFERD/__main__.py | 5 ----- PFERD/cli/parser.py | 5 +++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 9bc2974..8cb6a11 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -8,7 +8,6 @@ from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError from .logging import log from .pferd import Pferd, PferdLoadError from .transformer import RuleParseError -from .version import NAME, VERSION def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser: @@ -81,10 +80,6 @@ def dump_config(args: argparse.Namespace, config: Config) -> None: def main() -> None: args = PARSER.parse_args() - if args.version: - print(f"{NAME} {VERSION}") - exit() - # Configuring logging happens in two stages because CLI args have # precedence over config file options and loading the config already # produces some kinds of log messages (usually only explain()-s). 
diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index bd62b6e..af5c340 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -3,6 +3,7 @@ import configparser from pathlib import Path from ..output_dir import OnConflict, Redownload +from ..version import NAME, VERSION CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( @@ -70,8 +71,8 @@ PARSER = argparse.ArgumentParser() PARSER.set_defaults(command=None) PARSER.add_argument( "--version", - action="store_true", - help="print version and exit" + action="version", + version=f"{NAME} {VERSION}", ) PARSER.add_argument( "--config", "-c", From 1f400d59643dc8010b2828e9c6e11d29be673423 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 22:26:41 +0200 Subject: [PATCH 221/524] Implement BooleanOptionalAction --- PFERD/cli/parser.py | 58 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index af5c340..66c5959 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -1,10 +1,62 @@ import argparse import configparser from pathlib import Path +from typing import Any, List, Optional, Sequence, Union from ..output_dir import OnConflict, Redownload from ..version import NAME, VERSION + +# TODO Replace with argparse version when updating to 3.9? 
+class BooleanOptionalAction(argparse.Action): + def __init__( + self, + option_strings: List[str], + dest: Any, + default: Any = None, + type: Any = None, + choices: Any = None, + required: Any = False, + help: Any = None, + metavar: Any = None, + ): + if len(option_strings) != 1: + raise ValueError("There must be exactly one option string") + [self.name] = option_strings + if not self.name.startswith("--"): + raise ValueError(f"{self.name!r} doesn't start with '--'") + if self.name.startswith("--no-"): + raise ValueError(f"{self.name!r} starts with '--no-'") + + options = [self.name, "--no-" + self.name[2:]] + + super().__init__( + options, + dest, + nargs=0, + default=default, + type=type, + choices=choices, + required=required, + help=help, + metavar=metavar, + ) + + def __call__( + self, + parser: argparse.ArgumentParser, + namespace: argparse.Namespace, + values: Union[str, Sequence[Any], None], + option_string: Optional[str] = None, + ) -> None: + if option_string and option_string in self.option_strings: + value = not option_string.startswith("--no-") + setattr(namespace, self.dest, value) + + def format_usage(self) -> str: + return "--[no-]" + self.name[2:] + + CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( title="general crawler arguments", @@ -103,10 +155,8 @@ PARSER.add_argument( help="custom working directory" ) PARSER.add_argument( - "--explain", "-e", - # TODO Use argparse.BooleanOptionalAction after updating to 3.9 - action="store_const", - const=True, + "--explain", + action=BooleanOptionalAction, help="log and explain in detail what PFERD is doing" ) From 27b5a8e4905746c365ea8d7fa076804a4440410b Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 22:39:07 +0200 Subject: [PATCH 222/524] Rename log.action to log.status --- PFERD/crawl/crawler.py | 2 +- PFERD/logging.py | 6 +++--- PFERD/output_dir.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git 
a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index cefefad..d6d4abc 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -101,7 +101,7 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]): bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}" after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}" - self._stack.callback(lambda: log.action(after_desc)) + self._stack.callback(lambda: log.status(after_desc)) await self._stack.enter_async_context(self._limiter.limit_crawl()) bar = self._stack.enter_context(log.crawl_bar(bar_desc)) diff --git a/PFERD/logging.py b/PFERD/logging.py index fc13e13..5025d88 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -55,7 +55,7 @@ class Log: # Whether different parts of the output are enabled or disabled self.output_explain = False - self.output_action = True + self.output_status = True self.output_report = True def _update_live(self) -> None: @@ -191,12 +191,12 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_explain: self.print(f" {escape(text)}") - def action(self, text: str) -> None: + def status(self, text: str) -> None: """ Print a status update while crawling. Allows markup. 
""" - if self.output_action: + if self.output_status: self.print(text) def report(self, text: str) -> None: diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index cd68ead..17cb772 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -399,7 +399,7 @@ class OutputDirectory: async def _after_download(self, info: DownloadInfo) -> None: with self._ensure_deleted(info.tmp_path): - log.action(f"[bold cyan]Downloaded[/] {fmt_path(info.remote_path)}") + log.status(f"[bold cyan]Downloaded[/] {fmt_path(info.remote_path)}") log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") changed = False @@ -430,10 +430,10 @@ class OutputDirectory: self._update_metadata(info) if changed: - log.action(f"[bold bright_yellow]Changed[/] {escape(fmt_path(info.path))}") + log.status(f"[bold bright_yellow]Changed[/] {escape(fmt_path(info.path))}") self._report.change_file(info.path) else: - log.action(f"[bold bright_green]Added[/] {escape(fmt_path(info.path))}") + log.status(f"[bold bright_green]Added[/] {escape(fmt_path(info.path))}") self._report.add_file(info.path) async def cleanup(self) -> None: @@ -463,7 +463,7 @@ class OutputDirectory: if await self._conflict_delete_lf(self._on_conflict, pure): try: path.unlink() - log.action(f"[bold bright_magenta]Deleted[/] {escape(fmt_path(pure))}") + log.status(f"[bold bright_magenta]Deleted[/] {escape(fmt_path(pure))}") self._report.delete_file(pure) except OSError: pass From edbd92dbbf5a12a04e7d675b2df3cc1c1eba89ca Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 22:41:59 +0200 Subject: [PATCH 223/524] Add --status and --report flags --- PFERD/__main__.py | 10 ++++++++++ PFERD/cli/parser.py | 10 ++++++++++ PFERD/config.py | 6 ++++++ 3 files changed, 26 insertions(+) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 8cb6a11..0fbce59 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -39,11 +39,17 @@ def load_config(args: argparse.Namespace) -> Config: def configure_logging_from_args(args: 
argparse.Namespace) -> None: if args.explain is not None: log.output_explain = args.explain + if args.status is not None: + log.output_status = args.status + if args.report is not None: + log.output_report = args.report # We want to prevent any unnecessary output if we're printing the config to # stdout, otherwise it would not be a valid config file. if args.dump_config == "-": log.output_explain = False + log.output_status = False + log.output_report = False def configure_logging_from_config(args: argparse.Namespace, config: Config) -> None: @@ -56,6 +62,10 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N try: if args.explain is None: log.output_explain = config.default_section.explain() + if args.status is None: + log.output_status = config.default_section.status() + if args.report is None: + log.output_report = config.default_section.report() except ConfigOptionError as e: log.error(str(e)) exit(1) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index 66c5959..346070f 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -159,6 +159,16 @@ PARSER.add_argument( action=BooleanOptionalAction, help="log and explain in detail what PFERD is doing" ) +PARSER.add_argument( + "--status", + action=BooleanOptionalAction, + help="print status updates while PFERD is crawling" +) +PARSER.add_argument( + "--report", + action=BooleanOptionalAction, + help="print a report of all local changes before exiting" +) def load_default_section( diff --git a/PFERD/config.py b/PFERD/config.py index 3c69fc7..0c99683 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -75,6 +75,12 @@ class DefaultSection(Section): def explain(self) -> bool: return self.s.getboolean("explain", fallback=False) + def status(self) -> bool: + return self.s.getboolean("status", fallback=True) + + def report(self) -> bool: + return self.s.getboolean("report", fallback=True) + class Config: @staticmethod From 79be6e1dc541c660b0f39eb23b337baf718ddf06 Mon Sep 17 
00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 22:49:09 +0200 Subject: [PATCH 224/524] Switch some other options to BooleanOptionalAction --- PFERD/cli/command_kit_ilias_web.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index c743a51..e98f192 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -2,7 +2,7 @@ import argparse import configparser from pathlib import Path -from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler +from .parser import CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, load_crawler SUBPARSER = SUBPARSERS.add_parser( "kit-ilias-web", @@ -27,9 +27,7 @@ GROUP.add_argument( ) GROUP.add_argument( "--videos", - # TODO Use argparse.BooleanOptionalAction after updating to 3.9 - action="store_const", - const=True, + action=BooleanOptionalAction, help="crawl and download videos" ) GROUP.add_argument( @@ -46,9 +44,7 @@ GROUP.add_argument( ) GROUP.add_argument( "--link-file-plaintext", - # TODO Use argparse.BooleanOptionalAction after updating to 3.9 - action="store_const", - const=True, + action=BooleanOptionalAction, help="use plain text files for external links" ) From be4b1040f8a66292408154e05425cf17b1da8003 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 23 May 2021 22:51:42 +0200 Subject: [PATCH 225/524] Document status and report options --- CONFIG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CONFIG.md b/CONFIG.md index bd3baca..b976b7d 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -21,6 +21,10 @@ default values for the other sections. expanded to the current user's home directory. (Default: `.`) - `explain`: Whether PFERD should log and explain its actions and decisions in detail. (Default: `no`) +- `status`: Whether PFERD should print status updates while crawling. 
(Default: + `yes`) +- `report`: Whether PFERD should print a report of added, changed and deleted + local files for all crawlers before exiting. (Default: `yes`) ## The `crawl:*` sections From 8dd06894207789f763374a128584af13a82dee49 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 23:04:18 +0200 Subject: [PATCH 226/524] Add keyring authentication to ILIAS CLI --- PFERD/cli/command_kit_ilias_web.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index e98f192..e47bc77 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -36,6 +36,11 @@ GROUP.add_argument( metavar="USER_NAME", help="user name for authentication" ) +GROUP.add_argument( + "--keyring", + action=BooleanOptionalAction, + help="use the system keyring to store and retrieve passwords" +) GROUP.add_argument( "--link-file-redirect-delay", type=int, @@ -70,7 +75,11 @@ def load( parser["auth:kit-ilias-web"] = {} auth_section = parser["auth:kit-ilias-web"] - auth_section["type"] = "simple" + + if args.keyring: + auth_section["type"] = "keyring" + else: + auth_section["type"] = "simple" if args.username is not None: auth_section["username"] = str(args.username) From 3ab3581f849ae5ee223c434752dfeffdf30884a9 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 23 May 2021 23:40:28 +0200 Subject: [PATCH 227/524] Add timeout for HTTP connection --- CONFIG.md | 4 +++- PFERD/cli/command_kit_ilias_web.py | 8 ++++++++ PFERD/crawl/http_crawler.py | 10 +++++++++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 ++++++--- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index b976b7d..dcc7421 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -140,7 +140,9 @@ This crawler crawls the KIT ILIAS instance. 
It performs remote calls to a poor S - `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional HTML page instead. -- `no-videos`: If this is set to true, PFERD will not crawl or download any videos. +- `videos`: If this is set to false, PFERD will not crawl or download any videos. +- `http_timeout`: The timeout for http requests + ## Authenticator types ### The `simple` authenticator diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index e47bc77..89da390 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -52,6 +52,12 @@ GROUP.add_argument( action=BooleanOptionalAction, help="use plain text files for external links" ) +GROUP.add_argument( + "--http-timeout", + type=float, + metavar="SECONDS", + help="the timeout to use for HTTP requests" +) def load( @@ -72,6 +78,8 @@ def load( section["link_file_plaintext"] = str(args.link_file_plaintext) if args.videos is not None: section["videos"] = str(False) + if args.http_timeout is not None: + section["http_timeout"] = str(args.http_timeout) parser["auth:kit-ilias-web"] = {} auth_section = parser["auth:kit-ilias-web"] diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index e82dfed..8cd6afe 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -3,6 +3,7 @@ from pathlib import PurePath from typing import Optional import aiohttp +from aiohttp.client import ClientTimeout from ..config import Config from ..logging import log @@ -11,13 +12,18 @@ from ..version import NAME, VERSION from .crawler import Crawler, CrawlerSection +class HttpCrawlerSection(CrawlerSection): + def http_timeout(self) -> float: + return self.s.getfloat("http_timeout", fallback=20) + + class HttpCrawler(Crawler): COOKIE_FILE = PurePath(".cookies") def __init__( self, name: 
str, - section: CrawlerSection, + section: HttpCrawlerSection, config: Config, ) -> None: super().__init__(name, section, config) @@ -28,6 +34,7 @@ class HttpCrawler(Crawler): self._authentication_lock = asyncio.Lock() self._current_cookie_jar: Optional[aiohttp.CookieJar] = None self._request_count = 0 + self._http_timeout = section.http_timeout() async def _current_auth_id(self) -> int: """ @@ -97,6 +104,7 @@ class HttpCrawler(Crawler): async with aiohttp.ClientSession( headers={"User-Agent": f"{NAME}/{VERSION}"}, cookie_jar=self._current_cookie_jar, + timeout=ClientTimeout(total=self._http_timeout) ) as session: self.session = session try: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 33356ed..445997f 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,3 +1,4 @@ +import asyncio import re from pathlib import PurePath from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union @@ -11,15 +12,15 @@ from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param -from ..crawler import CrawlError, CrawlerSection, CrawlWarning, anoncritical -from ..http_crawler import HttpCrawler +from ..crawler import CrawlError, CrawlWarning, anoncritical +from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import link_template_plain, link_template_rich from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement TargetType = Union[str, int] -class KitIliasWebCrawlerSection(CrawlerSection): +class KitIliasWebCrawlerSection(HttpCrawlerSection): def target(self) -> TargetType: target = self.s.get("target") @@ -92,6 +93,8 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: last_exception = e except aiohttp.ClientConnectionError as e: # e.g. 
timeout, disconnect, resolve failed, etc. last_exception = e + except asyncio.exceptions.TimeoutError as e: # explicit http timeouts in HttpCrawler + last_exception = e log.explain_topic(f"Retrying operation {name}. Retries left: {attempts - 1 - round}") if last_exception: From fca62541ca56408addecdbcab57c9f958c772f69 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 May 2021 00:24:31 +0200 Subject: [PATCH 228/524] De-duplicate element names in ILIAS crawler This prevents any conflicts caused by multiple files with the same name. Conflicts may still arise due to transforms, but that is out of our control and a user error. --- PFERD/crawl/ilias/kit_ilias_html.py | 52 ++++++++++++++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 4 +- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 636fa68..61df57a 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -461,3 +461,55 @@ def _tomorrow() -> date: def _sanitize_path_name(name: str) -> str: return name.replace("/", "-").replace("\\", "-").strip() + + +def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]: + """ + De-duplicates element names by appending an incrementing number to later elements: + test.pdf + test.pdf + would result in + test.pdf + test_1.pdf + + It is also space-aware: + "te st.pdf" + "te st.pdf" + would result in + "tes st.pdf" + "tes st 1.pdf" + """ + known_names = dict() + result_elements = [] + + for element in elements: + # This file is new - add it and mark its name as used + if element.name not in known_names: + known_names[element.name] = 1 + result_elements.append(element) + continue + + # This file is a duplicate. 
Find a suitable suffix + current_counter = known_names[element.name] + adjusted_element = _append_number(element, current_counter) + # increment the counter so the next duplicate does not conflict + known_names[element.name] += 1 + # also block the new name, so another file with the *renamed* name gets renamed as well + known_names[adjusted_element.name] = 1 + + result_elements.append(adjusted_element) + + return result_elements + + +def _append_number(element: IliasPageElement, number: int) -> IliasPageElement: + extension_index = element.name.rfind(".") + suffix = f" {number}" if " " in element.name else f"_{number}" + if extension_index < 0: + new_name = element.name + suffix + else: + new_name = element.name[:extension_index] + suffix + element.name[extension_index:] + + return IliasPageElement( + element.type, element.url, new_name, element.mtime, element.description + ) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 445997f..222e1d6 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -15,7 +15,7 @@ from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import CrawlError, CrawlWarning, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import link_template_plain, link_template_rich -from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement +from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names TargetType = Union[str, int] @@ -214,6 +214,7 @@ class KitIliasWebCrawler(HttpCrawler): # Fill up our task list with the found elements await gather_elements() + elements = deduplicate_element_names(elements) tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements] # And execute them @@ -240,6 +241,7 @@ class KitIliasWebCrawler(HttpCrawler): # Fill up our task list with the found elements await 
gather_elements() + elements = deduplicate_element_names(elements) tasks = [self._handle_ilias_element(path, element) for element in elements] # And execute them From c687d4a51a27bb2121293282a4640c91c5a4ac14 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 24 May 2021 13:10:19 +0200 Subject: [PATCH 229/524] Implement cookie sharing --- CONFIG.md | 5 ++ PFERD/cli/parser.py | 9 ++- PFERD/config.py | 3 + PFERD/crawl/http_crawler.py | 80 +++++++++++++++++----- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 ++- PFERD/pferd.py | 10 ++- 6 files changed, 95 insertions(+), 21 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index dcc7421..7e8a717 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -25,6 +25,11 @@ default values for the other sections. `yes`) - `report`: Whether PFERD should print a report of added, changed and deleted local files for all crawlers before exiting. (Default: `yes`) +- `share_cookies`: Whether crawlers should share cookies where applicable. By + default, crawlers are isolated and don't interact with each other. This + includes their cookies. However, in situations where multiple crawlers crawl + the same website using the same account, sharing cookies between crawlers can + make sense. 
(Default: `yes`) ## The `crawl:*` sections diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index 346070f..72abb76 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -169,6 +169,11 @@ PARSER.add_argument( action=BooleanOptionalAction, help="print a report of all local changes before exiting" ) +PARSER.add_argument( + "--share-cookies", + action=BooleanOptionalAction, + help="whether crawlers should share cookies where applicable" +) def load_default_section( @@ -180,7 +185,9 @@ def load_default_section( if args.working_dir is not None: section["working_dir"] = str(args.working_dir) if args.explain is not None: - section["explain"] = "true" if args.explain else "false" + section["explain"] = "yes" if args.explain else "no" + if args.share_cookies is not None: + section["share_cookies"] = "yes" if args.share_cookies else "no" SUBPARSERS = PARSER.add_subparsers(title="crawlers") diff --git a/PFERD/config.py b/PFERD/config.py index 0c99683..abd6e9e 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -81,6 +81,9 @@ class DefaultSection(Section): def report(self) -> bool: return self.s.getboolean("report", fallback=True) + def share_cookies(self) -> bool: + return self.s.getboolean("share_cookies", fallback=True) + class Config: @staticmethod diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 8cd6afe..facc2ba 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -1,10 +1,11 @@ import asyncio -from pathlib import PurePath -from typing import Optional +from pathlib import Path, PurePath +from typing import Dict, List, Optional import aiohttp from aiohttp.client import ClientTimeout +from ..auth import Authenticator from ..config import Config from ..logging import log from ..utils import fmt_real_path @@ -25,17 +26,22 @@ class HttpCrawler(Crawler): name: str, section: HttpCrawlerSection, config: Config, + shared_auth: Optional[Authenticator] = None, ) -> None: super().__init__(name, section, config) - 
self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) - self._output_dir.register_reserved(self.COOKIE_FILE) self._authentication_id = 0 self._authentication_lock = asyncio.Lock() - self._current_cookie_jar: Optional[aiohttp.CookieJar] = None self._request_count = 0 self._http_timeout = section.http_timeout() + self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) + self._shared_cookie_jar_paths: Optional[List[Path]] = None + self._shared_auth = shared_auth + self._current_cookie_jar: Optional[aiohttp.CookieJar] = None + + self._output_dir.register_reserved(self.COOKIE_FILE) + async def _current_auth_id(self) -> int: """ Returns the id for the current authentication, i.e. an identifier for the last @@ -71,7 +77,7 @@ class HttpCrawler(Crawler): self._authentication_id += 1 # Saving the cookies after the first auth ensures we won't need to re-authenticate # on the next run, should this one be aborted or crash - await self._save_cookies() + self._save_cookies() async def _authenticate(self) -> None: """ @@ -80,26 +86,68 @@ class HttpCrawler(Crawler): """ raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation") - async def _save_cookies(self) -> None: + def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None: + if not self._shared_auth: + return + + if self._shared_auth in shared: + self._shared_cookie_jar_paths = shared[self._shared_auth] + else: + self._shared_cookie_jar_paths = [] + shared[self._shared_auth] = self._shared_cookie_jar_paths + + self._shared_cookie_jar_paths.append(self._cookie_jar_path) + + def _load_cookies(self) -> None: + log.explain_topic("Loading cookies") + cookie_jar_path: Optional[Path] = None + + if self._shared_cookie_jar_paths is None: + log.explain("Not sharing any cookies") + cookie_jar_path = self._cookie_jar_path + else: + log.explain("Sharing cookies") + max_mtime: Optional[float] = None + for path in self._shared_cookie_jar_paths: + if not 
path.is_file(): + log.explain(f"{fmt_real_path(path)} is not a file") + continue + mtime = path.stat().st_mtime + if max_mtime is None or mtime > max_mtime: + log.explain(f"{fmt_real_path(path)} has newest mtime so far") + max_mtime = mtime + cookie_jar_path = path + else: + log.explain(f"{fmt_real_path(path)} has older mtime") + + if cookie_jar_path is None: + log.explain("Couldn't find a suitable cookie file") + return + + log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}") + try: + self._current_cookie_jar = aiohttp.CookieJar() + self._current_cookie_jar.load(cookie_jar_path) + except Exception as e: + log.explain("Failed to load cookies") + log.explain(str(e)) + + def _save_cookies(self) -> None: log.explain_topic("Saving cookies") if not self._current_cookie_jar: log.explain("No cookie jar, save aborted") return try: + log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}") self._current_cookie_jar.save(self._cookie_jar_path) - log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}") - except Exception: + except Exception as e: log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") + log.warn(str(e)) async def run(self) -> None: - self._current_cookie_jar = aiohttp.CookieJar() self._request_count = 0 - - try: - self._current_cookie_jar.load(self._cookie_jar_path) - except Exception: - pass + self._load_cookies() async with aiohttp.ClientSession( headers={"User-Agent": f"{NAME}/{VERSION}"}, @@ -114,4 +162,4 @@ class HttpCrawler(Crawler): log.explain_topic(f"Total amount of HTTP requests: {self._request_count}") # They are saved in authenticate, but a final save won't hurt - await self._save_cookies() + self._save_cookies() diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 222e1d6..d488974 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -152,12 +152,15 @@ class 
KitIliasWebCrawler(HttpCrawler): config: Config, authenticators: Dict[str, Authenticator] ): - super().__init__(name, section, config) + # Setting a main authenticator for cookie sharing + auth = section.auth(authenticators) + super().__init__(name, section, config, shared_auth=auth) self._shibboleth_login = KitShibbolethLogin( - section.auth(authenticators), - section.tfa_auth(authenticators) + auth, + section.tfa_auth(authenticators), ) + self._base_url = "https://ilias.studium.kit.edu" self._target = section.target() diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 2b9921e..35f5194 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -1,10 +1,11 @@ +from pathlib import Path from typing import Dict, List, Optional from rich.markup import escape from .auth import AUTHENTICATORS, Authenticator from .config import Config, ConfigOptionError -from .crawl import CRAWLERS, Crawler, CrawlError +from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler from .logging import log from .utils import fmt_path @@ -42,6 +43,9 @@ class Pferd: def _load_crawlers(self) -> List[str]: names = [] + # Cookie sharing + kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {} + for name, section in self._config.crawler_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") names.append(name) @@ -54,6 +58,10 @@ class Pferd: crawler = crawler_constructor(name, section, self._config, self._authenticators) self._crawlers[name] = crawler + if self._config.default_section.share_cookies(): + if isinstance(crawler, KitIliasWebCrawler): + crawler.share_cookies(kit_ilias_web_paths) + return names def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]: From 1c1f781be421bfdab29ee942b6ab1892dd948186 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 24 May 2021 13:17:28 +0200 Subject: [PATCH 230/524] Reword some log messages --- PFERD/config.py | 2 +- PFERD/pferd.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git 
a/PFERD/config.py b/PFERD/config.py index abd6e9e..8293331 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -110,7 +110,7 @@ class Config: """ if path: - log.explain("Using custom path") + log.explain("Path specified on CLI") else: log.explain("Using default path") path = Config._default_path() diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 35f5194..bed7c66 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -80,7 +80,7 @@ class Pferd: for name in self._crawlers_to_run: section_name = f"crawl:{name}" if section_name in self._crawlers: - log.explain(f"Found crawler section named {section_name!r}") + log.explain(f"Crawler section named {section_name!r} exists") names.append(section_name) else: log.explain(f"There's no crawler section named {section_name!r}") @@ -128,16 +128,16 @@ class Pferd: log.report("") log.report(f"[bold bright_cyan]Report[/] for {escape(name)}") - something_happened = False + something_changed = False for path in sorted(crawler.report.added_files): - something_happened = True + something_changed = True log.report(f" [bold bright_green]Added[/] {fmt_path(path)}") for path in sorted(crawler.report.changed_files): - something_happened = True + something_changed = True log.report(f" [bold bright_yellow]Changed[/] {fmt_path(path)}") for path in sorted(crawler.report.deleted_files): - something_happened = True + something_changed = True log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}") - if not something_happened: - log.report(" Nothing happened") + if not something_changed: + log.report(" Nothing changed") From 5c761930458dc1a181e9014d3e0856d68bf5dc6f Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 24 May 2021 15:21:25 +0200 Subject: [PATCH 231/524] Set up pyinstaller --- .gitignore | 5 ++ pferd.py | 4 ++ scripts/build | 5 ++ scripts/check | 2 + scripts/format | 2 + scripts/setup | 3 + sync_url.py | 160 ------------------------------------------------- 7 files changed, 21 insertions(+), 160 deletions(-) create mode 100644 
pferd.py create mode 100755 scripts/build delete mode 100755 sync_url.py diff --git a/.gitignore b/.gitignore index 2928b54..455eaca 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,8 @@ /PFERD.egg-info/ __pycache__/ /.vscode/ + +# pyinstaller +/pferd.spec +/build/ +/dist/ diff --git a/pferd.py b/pferd.py new file mode 100644 index 0000000..21857f4 --- /dev/null +++ b/pferd.py @@ -0,0 +1,4 @@ +from PFERD.__main__ import main + +if __name__ == "__main__": + main() diff --git a/scripts/build b/scripts/build new file mode 100755 index 0000000..6f88655 --- /dev/null +++ b/scripts/build @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +set -e + +pyinstaller --onefile pferd.py diff --git a/scripts/check b/scripts/check index ba767cd..2283951 100755 --- a/scripts/check +++ b/scripts/check @@ -1,4 +1,6 @@ #!/usr/bin/env bash +set -e + mypy PFERD flake8 PFERD diff --git a/scripts/format b/scripts/format index cc196ae..d8917ef 100755 --- a/scripts/format +++ b/scripts/format @@ -1,4 +1,6 @@ #!/usr/bin/env bash +set -e + autopep8 --recursive --in-place PFERD isort PFERD diff --git a/scripts/setup b/scripts/setup index 8a5399b..46fe2f8 100755 --- a/scripts/setup +++ b/scripts/setup @@ -1,5 +1,8 @@ #!/usr/bin/env bash +set -e + pip install --upgrade pip setuptools pip install --editable . pip install --upgrade mypy flake8 autopep8 isort +pip install --upgrade pyinstaller diff --git a/sync_url.py b/sync_url.py deleted file mode 100755 index ca78de0..0000000 --- a/sync_url.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python - -""" -A simple script to download a course by name from ILIAS. 
-""" - -import argparse -import logging -import sys -from pathlib import Path, PurePath -from typing import Optional -from urllib.parse import urlparse - -from PFERD import Pferd -from PFERD.authenticators import KeyringAuthenticator, UserPassAuthenticator -from PFERD.cookie_jar import CookieJar -from PFERD.ilias import (IliasCrawler, IliasElementType, - KitShibbolethAuthenticator) -from PFERD.logging import PrettyLogger, enable_logging -from PFERD.organizer import (ConflictType, FileConflictResolution, - FileConflictResolver, resolve_prompt_user) -from PFERD.transform import sanitize_windows_path -from PFERD.utils import to_path - -_LOGGER = logging.getLogger("sync_url") -_PRETTY = PrettyLogger(_LOGGER) - - -def _extract_credentials(file_path: Optional[str]) -> UserPassAuthenticator: - if not file_path: - return UserPassAuthenticator("KIT ILIAS Shibboleth", None, None) - - if not Path(file_path).exists(): - _PRETTY.error("Credential file does not exist") - sys.exit(1) - - with open(file_path, "r") as file: - first_line = file.read().splitlines()[0] - read_name, *read_password = first_line.split(":", 1) - - name = read_name if read_name else None - password = read_password[0] if read_password else None - return UserPassAuthenticator("KIT ILIAS Shibboleth", username=name, password=password) - - -def _resolve_remote_first(_path: PurePath, _conflict: ConflictType) -> FileConflictResolution: - return FileConflictResolution.DESTROY_EXISTING - - -def _resolve_local_first(_path: PurePath, _conflict: ConflictType) -> FileConflictResolution: - return FileConflictResolution.KEEP_EXISTING - - -def _resolve_no_delete(_path: PurePath, conflict: ConflictType) -> FileConflictResolution: - # Update files - if conflict == ConflictType.FILE_OVERWRITTEN: - return FileConflictResolution.DESTROY_EXISTING - if conflict == ConflictType.MARKED_FILE_OVERWRITTEN: - return FileConflictResolution.DESTROY_EXISTING - # But do not delete them - return FileConflictResolution.KEEP_EXISTING - - 
-def main() -> None: - enable_logging(name="sync_url") - - parser = argparse.ArgumentParser() - parser.add_argument("--test-run", action="store_true") - parser.add_argument('-c', '--cookies', nargs='?', default=None, help="File to store cookies in") - parser.add_argument('-u', '--username', nargs='?', default=None, help="Username for Ilias") - parser.add_argument('-p', '--password', nargs='?', default=None, help="Password for Ilias") - parser.add_argument('--credential-file', nargs='?', default=None, - help="Path to a file containing credentials for Ilias. The file must have " - "one line in the following format: ':'") - parser.add_argument("-k", "--keyring", action="store_true", - help="Use the system keyring service for authentication") - parser.add_argument('--no-videos', action="store_true", help="Don't download videos") - parser.add_argument('--local-first', action="store_true", - help="Don't prompt for confirmation, keep existing files") - parser.add_argument('--remote-first', action="store_true", - help="Don't prompt for confirmation, delete and overwrite local files") - parser.add_argument('--no-delete', action="store_true", - help="Don't prompt for confirmation, overwrite local files, don't delete") - parser.add_argument('url', help="URL to the course page") - parser.add_argument('folder', nargs='?', default=None, help="Folder to put stuff into") - args = parser.parse_args() - - cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None) - session = cookie_jar.create_session() - - if args.keyring: - if not args.username: - _PRETTY.error("Keyring auth selected but no --username passed!") - return - inner_auth: UserPassAuthenticator = KeyringAuthenticator( - "KIT ILIAS Shibboleth", username=args.username, password=args.password - ) - else: - inner_auth = _extract_credentials(args.credential_file) - - username, password = inner_auth.get_credentials() - authenticator = KitShibbolethAuthenticator(inner_auth) - - url = urlparse(args.url) - - crawler 
= IliasCrawler(url.scheme + '://' + url.netloc, session, - authenticator, lambda x, y: True) - - cookie_jar.load_cookies() - - if args.folder is None: - element_name = crawler.find_element_name(args.url) - if not element_name: - print("Error, could not get element name. Please specify a folder yourself.") - return - folder = sanitize_windows_path(Path(element_name.replace("/", "-").replace("\\", "-"))) - cookie_jar.save_cookies() - else: - folder = Path(args.folder) - - # files may not escape the pferd_root with relative paths - # note: Path(Path.cwd, Path(folder)) == Path(folder) if it is an absolute path - pferd_root = Path(Path.cwd(), Path(folder)).parent - # Folder might be a *PurePath* at this point - target = Path(folder).resolve().name - pferd = Pferd(pferd_root, test_run=args.test_run) - - def dir_filter(_: Path, element: IliasElementType) -> bool: - if args.no_videos: - return element not in [IliasElementType.VIDEO_FILE, IliasElementType.VIDEO_FOLDER] - return True - - if args.local_first: - file_confilict_resolver: FileConflictResolver = _resolve_local_first - elif args.no_delete: - file_confilict_resolver = _resolve_no_delete - elif args.remote_first: - file_confilict_resolver = _resolve_remote_first - else: - file_confilict_resolver = resolve_prompt_user - - pferd.enable_logging() - - # fetch - pferd.ilias_kit_folder( - target=target, - full_url=args.url, - cookies=args.cookies, - dir_filter=dir_filter, - username=username, - password=password, - file_conflict_resolver=file_confilict_resolver, - transform=sanitize_windows_path - ) - - pferd.print_summary() - - -if __name__ == "__main__": - main() From d44f6966c28ecfe87e9e9d7ae9c65382418db664 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 May 2021 16:22:11 +0200 Subject: [PATCH 232/524] Log authentication attempts in HTTP crawler --- PFERD/crawl/http_crawler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 
facc2ba..45f9df2 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -67,12 +67,18 @@ class HttpCrawler(Crawler): the request was made. This ensures that authentication is not performed needlessly. """ async with self._authentication_lock: + log.explain_topic("Authenticating") # Another thread successfully called authenticate in-between # We do not want to perform auth again, so we return here. We can # assume the other thread suceeded as authenticate will throw an error # if it failed and aborts the crawl process. if caller_auth_id != self._authentication_id: + log.explain( + "Authentication skipped due to auth id mismatch." + "A previous authentication beat us to the race." + ) return + log.explain("Calling crawler-specific authenticate") await self._authenticate() self._authentication_id += 1 # Saving the cookies after the first auth ensures we won't need to re-authenticate From 342076ee0e6da7b96e26da9caa6ef8c0b7f7bf70 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 May 2021 16:22:51 +0200 Subject: [PATCH 233/524] Handle exercise detail containers in ILIAS html parser --- PFERD/crawl/ilias/kit_ilias_html.py | 65 ++++++++++++++++++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 1 + 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 61df57a..36da7d4 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -16,6 +16,7 @@ TargetType = Union[str, int] class IliasElementType(Enum): EXERCISE = "exercise" + EXERCISE_FILES = "exercise_files" # own submitted files FILE = "file" FOLDER = "folder" FORUM = "forum" @@ -197,6 +198,43 @@ class IliasPage: return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) def _find_exercise_entries(self) -> List[IliasPageElement]: + if self._soup.find(id="tab_submission"): + log.explain("Found submission tab. 
This is an exercise detail page") + return self._find_exercise_entries_detail_page() + log.explain("Found no submission tab. This is an exercise root page") + return self._find_exercise_entries_root_page() + + def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]: + results: List[IliasPageElement] = [] + + # Find all download links in the container (this will contain all the files) + download_links: List[Tag] = self._soup.findAll( + name="a", + # download links contain the given command class + attrs={"href": lambda x: x and "cmd=download" in x}, + text="Download" + ) + + for link in download_links: + parent_row: Tag = link.findParent("tr") + children: List[Tag] = parent_row.findChildren("td") + + # + # 0 1 2 3 4 + name = _sanitize_path_name(children[1].getText().strip()) + date = demangle_date(children[3].getText().strip()) + + log.explain(f"Found exercise detail entry {name!r}") + results.append(IliasPageElement( + IliasElementType.FILE, + self._abs_url_from_link(link), + name, + date + )) + + return results + + def _find_exercise_entries_root_page(self) -> List[IliasPageElement]: results: List[IliasPageElement] = [] # Each assignment is in an accordion container @@ -205,6 +243,8 @@ class IliasPage: for container in assignment_containers: # Fetch the container name out of the header to use it in the path container_name = container.select_one(".ilAssignmentHeader").getText().strip() + log.explain(f"Found exercise container {container_name!r}") + # Find all download links in the container (this will contain all the files) files: List[Tag] = container.findAll( name="a", @@ -213,8 +253,6 @@ class IliasPage: text="Download" ) - log.explain(f"Found exercise container {container_name!r}") - # Grab each file as you now have the link for file_link in files: # Two divs, side by side. Left is the name, right is the link ==> get left @@ -231,6 +269,25 @@ class IliasPage: None # We do not have any timestamp )) + # Find all links to file listings (e.g. 
"Submitted Files" for groups) + file_listings: List[Tag] = container.findAll( + name="a", + # download links contain the given command class + attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x} + ) + + # Add each listing as a new + for listing in file_listings: + file_name = _sanitize_path_name(listing.getText().strip()) + url = self._abs_url_from_link(listing) + log.explain(f"Found exercise detail {file_name!r} at {url}") + results.append(IliasPageElement( + IliasElementType.EXERCISE_FILES, + url, + container_name + "/" + file_name, + None # we do not have any timestamp + )) + return results def _find_normal_entries(self) -> List[IliasPageElement]: @@ -349,7 +406,7 @@ class IliasPage: if found_parent is None: _unexpected_html_warning() - log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url!r}") + log.warn_contd(f"Tried to figure out element type, but did not find an icon for {url}") return None # Find the small descriptive icon to figure out the type @@ -357,7 +414,7 @@ class IliasPage: if img_tag is None: _unexpected_html_warning() - log.warn_contd(f"Tried to figure out element type, but did not find an image for {url!r}") + log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") return None if "opencast" in str(img_tag["alt"]).lower(): diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index d488974..11b27d1 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -61,6 +61,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.EXERCISE, + IliasElementType.EXERCISE_FILES, IliasElementType.FOLDER, IliasElementType.MEETING, IliasElementType.VIDEO_FOLDER, From 492ec6a93239ffc1924c7a56f527c346224764f1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 May 2021 16:32:29 +0200 Subject: [PATCH 234/524] Detect and 
skip ILIAS tests --- PFERD/crawl/ilias/kit_ilias_html.py | 9 ++++++--- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 4 ++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 36da7d4..7bb7084 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -17,6 +17,7 @@ TargetType = Union[str, int] class IliasElementType(Enum): EXERCISE = "exercise" EXERCISE_FILES = "exercise_files" # own submitted files + TEST = "test" # an online test. Will be ignored currently. FILE = "file" FOLDER = "folder" FORUM = "forum" @@ -373,9 +374,8 @@ class IliasPage: if "target=file_" in parsed_url.query: return IliasElementType.FILE - # Skip forums - if "cmd=showThreads" in parsed_url.query: - return IliasElementType.FORUM + # TODO: Match based on CMD_CLASS or icon? The folder_like check looks at the icon, + # but we could also match the command class. I am not sure what's more stable. # Everything with a ref_id can *probably* be opened to reveal nested things # video groups, directories, exercises, etc @@ -432,6 +432,9 @@ class IliasPage: if str(img_tag["src"]).endswith("sess.svg"): return IliasElementType.MEETING + if str(img_tag["src"]).endswith("icon_tst.svg"): + return IliasElementType.TEST + return IliasElementType.FOLDER @staticmethod diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 11b27d1..0bd3202 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -274,6 +274,10 @@ class KitIliasWebCrawler(HttpCrawler): log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Forums are not supported") log.explain("Answer: No") + elif element.type == IliasElementType.TEST: + log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") + log.explain("Tests contain no relevant files") + log.explain("Answer: No") elif element.type == 
IliasElementType.LINK: await self._download_link(element, element_path) elif element.type == IliasElementType.VIDEO: From 86ba47541b9217b70f686dc1e7ff84aec7a45325 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 24 May 2021 16:53:50 +0200 Subject: [PATCH 235/524] Fix cookie loading and saving --- PFERD/crawl/http_crawler.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 45f9df2..f0370a3 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -38,7 +38,6 @@ class HttpCrawler(Crawler): self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) self._shared_cookie_jar_paths: Optional[List[Path]] = None self._shared_auth = shared_auth - self._current_cookie_jar: Optional[aiohttp.CookieJar] = None self._output_dir.register_reserved(self.COOKIE_FILE) @@ -106,6 +105,7 @@ class HttpCrawler(Crawler): def _load_cookies(self) -> None: log.explain_topic("Loading cookies") + cookie_jar_path: Optional[Path] = None if self._shared_cookie_jar_paths is None: @@ -132,32 +132,29 @@ class HttpCrawler(Crawler): log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}") try: - self._current_cookie_jar = aiohttp.CookieJar() - self._current_cookie_jar.load(cookie_jar_path) + self._cookie_jar.load(cookie_jar_path) except Exception as e: log.explain("Failed to load cookies") log.explain(str(e)) def _save_cookies(self) -> None: log.explain_topic("Saving cookies") - if not self._current_cookie_jar: - log.explain("No cookie jar, save aborted") - return try: log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}") - self._current_cookie_jar.save(self._cookie_jar_path) + self._cookie_jar.save(self._cookie_jar_path) except Exception as e: log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") log.warn(str(e)) async def run(self) -> None: self._request_count = 0 + self._cookie_jar = aiohttp.CookieJar() self._load_cookies() 
async with aiohttp.ClientSession( headers={"User-Agent": f"{NAME}/{VERSION}"}, - cookie_jar=self._current_cookie_jar, + cookie_jar=self._cookie_jar, timeout=ClientTimeout(total=self._http_timeout) ) as session: self.session = session From d8bd1f518ab25502299b44c84375261e6a4fc888 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 24 May 2021 15:43:53 +0200 Subject: [PATCH 236/524] Set up build and release workflow --- .github/workflows/build-and-release.yml | 78 +++++++++++++++++++++++++ scripts/setup | 11 +++- 2 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/build-and-release.yml diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml new file mode 100644 index 0000000..565c4e3 --- /dev/null +++ b/.github/workflows/build-and-release.yml @@ -0,0 +1,78 @@ +name: build-and-release + +on: push + +defaults: + run: + shell: bash + +jobs: + + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python: ["3.8"] + steps: + + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python }} + + - name: Set up project + if: matrix.os != 'windows-latest' + run: ./scripts/setup + + - name: Set up project on windows + if: matrix.os == 'windows-latest' + # For some reason, `pip install --upgrade pip` doesn't work on + # 'windows-latest'. The installed pip version works fine however. 
+ run: ./scripts/setup --no-pip + + - name: Run checks + run: ./scripts/check + + - name: Build + run: ./scripts/build + + - name: Rename binary + # Glob in source location because on windows pyinstaller creates a file + # named "pferd.exe" + run: mv dist/pferd* dist/pferd-${{ matrix.os }} + + - name: Upload binary + uses: actions/upload-artifact@v2 + with: + name: Binaries + path: dist/pferd-${{ matrix.os }} + + release: + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/v') + needs: build + steps: + + - name: Download binaries + uses: actions/download-artifact@v2 + with: + name: Binaries + + - name: Rename binaries + run: | + mv pferd-ubuntu-latest pferd-linux + mv pferd-windows-latest pferd-windows.exe + mv pferd-macos-latest pferd-mac + + - name: Create release + uses: softprops/action-gh-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + files: | + pferd-linux + pferd-windows.exe + pferd-mac diff --git a/scripts/setup b/scripts/setup index 46fe2f8..b48fb1a 100755 --- a/scripts/setup +++ b/scripts/setup @@ -2,7 +2,16 @@ set -e -pip install --upgrade pip setuptools +# Updating pip and setuptools because some older versions don't recognize the +# project setup correctly +if [[ $1 != '--no-pip' ]]; then + pip install --upgrade pip +fi +pip install --upgrade setuptools + +# Installing PFERD itself pip install --editable . 
+ +# Installing various tools pip install --upgrade mypy flake8 autopep8 isort pip install --upgrade pyinstaller From 1739c5409158663fb27b9ed8bd1d595f8d9c77ec Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 24 May 2021 17:50:17 +0200 Subject: [PATCH 237/524] Add checklist for releasing new versions --- DEV.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/DEV.md b/DEV.md index 212cec8..d530d1f 100644 --- a/DEV.md +++ b/DEV.md @@ -24,6 +24,9 @@ installs PFERD via `pip install --editable .`, which means that you can just run `--editable`, there is no need to re-run `pip install` when the source code is changed. +If you get any errors because pip can't update itself, try running +`./scripts/setup --no-pip` instead of `./scripts/setup`. + For more details, see [this part of the Python Tutorial][venv-tut] and [this section on "development mode"][ppug-dev]. @@ -50,3 +53,33 @@ that `./scripts/check` returns no warnings and the code has been run through `./scripts/format`. In your first PR, please add your name to the `LICENSE` file. + +## Releasing a new version + +1. Update the version number in `PFERD/version.py` +2. Update `CHANGELOG.md` +3. Commit changes to `master` with message `Bump version to ` (e. g. `Bump version to 3.2.5`) +4. Create annotated tag named `v` (e. g. `v3.2.5`) + - Copy changes from changelog + - Remove `#` symbols (which git would interpret as comments) + - As the first line, add `Version - ` (e. g. `Version 3.2.5 - 2021-05-24`) + - Leave the second line empty +5. Fast-forward `latest` to `master` +6. 
Push `master`, `latest` and the new tag + +Example tag annotation: +``` +Version 3.2.5 - 2021-05-24 + +Added +- Support for concurrent downloads +- Support for proper config files +- This changelog + +Changed +- Rewrote almost everything +- Redesigned CLI + +Removed +- Backwards compatibility with 2.x +``` From 9ce20216b5f11949be7c818c3f78d956bb5e7162 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 May 2021 18:32:18 +0200 Subject: [PATCH 238/524] Do not set a timeout for whole HTTP request Downloads might take longer! --- PFERD/crawl/http_crawler.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index f0370a3..177972b 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -155,7 +155,15 @@ class HttpCrawler(Crawler): async with aiohttp.ClientSession( headers={"User-Agent": f"{NAME}/{VERSION}"}, cookie_jar=self._cookie_jar, - timeout=ClientTimeout(total=self._http_timeout) + timeout=ClientTimeout( + # 30 minutes. No download in the history of downloads was longer than 30 minutes. + # This is enough to transfer a 600 MB file over a 3 Mib/s connection. + # Allowing an arbitrary value could be annoying for overnight batch jobs + total=15 * 60, + connect=self._http_timeout, + sock_connect=self._http_timeout, + sock_read=self._http_timeout, + ) ) as session: self.session = session try: From 85f89a7ff36aba8a82c94226c6829813a7b26e9e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 May 2021 18:53:00 +0200 Subject: [PATCH 239/524] Interpret accordions and expandable headers as virtual folders This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test". 
--- PFERD/crawl/ilias/kit_ilias_html.py | 50 +++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 7bb7084..43d66b5 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -299,7 +299,13 @@ class IliasPage: for link in links: abs_url = self._abs_url_from_link(link) - element_name = _sanitize_path_name(link.getText()) + parents = self._find_upwards_folder_hierarchy(link) + + if parents: + element_name = "/".join(parents) + "/" + _sanitize_path_name(link.getText()) + else: + element_name = _sanitize_path_name(link.getText()) + element_type = self._find_type_from_link(element_name, link, abs_url) description = self._find_link_description(link) @@ -318,6 +324,47 @@ class IliasPage: return result + def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: + """ + Interprets accordions and expandable blocks as virtual folders and returns them + in order. This allows us to find a file named "Test" in an accordion "Acc" as "Acc/Test" + """ + found_titles = [] + + outer_accordion_content: Optional[Tag] = None + + parents: List[Tag] = list(tag.parents) + for parent in parents: + if not parent.get("class"): + continue + + # ILIAS has proper accordions and weird blocks that look like normal headings, + # but some JS later transforms them into an accordion. + + # This is for these weird JS-y blocks + if "ilContainerItemsContainer" in parent.get("class"): + # I am currently under the impression that *only* those JS blocks have an + # ilNoDisplay class. 
+ if "ilNoDisplay" not in parent.get("class"): + continue + prev: Tag = parent.findPreviousSibling("div") + if "ilContainerBlockHeader" in prev.get("class"): + found_titles.append(prev.find("h3").getText().strip()) + + # And this for real accordions + if "il_VAccordionContentDef" in parent.get("class"): + outer_accordion_content = parent + break + + if outer_accordion_content: + accordion_tag: Tag = outer_accordion_content.parent + head_tag: Tag = accordion_tag.find(attrs={ + "class": lambda x: x and "ilc_va_ihead_VAccordIHead" in x + }) + found_titles.append(head_tag.getText().strip()) + + return [_sanitize_path_name(x) for x in reversed(found_titles)] + def _find_link_description(self, link: Tag) -> Optional[str]: tile: Tag = link.findParent("div", {"class": lambda x: x and "il_ContainerListItem" in x}) if not tile: @@ -353,7 +400,6 @@ class IliasPage: modification_date = demangle_date(modification_date_str) # Grab the name from the link text - name = _sanitize_path_name(link_element.getText()) full_path = name + "." + file_type log.explain(f"Found file {full_path!r}") From c33de233dcfd9f943cbbaf1deb5cb65871b5bd67 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 24 May 2021 20:08:49 +0200 Subject: [PATCH 240/524] Add script for releasing new versions --- CHANGELOG.md | 2 +- DEV.md | 4 ++ scripts/bump-version | 111 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100755 scripts/bump-version diff --git a/CHANGELOG.md b/CHANGELOG.md index 14966d7..519c046 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [Unreleased] +## Unreleased ### Added - Support for concurrent downloads diff --git a/DEV.md b/DEV.md index d530d1f..f577b93 100644 --- a/DEV.md +++ b/DEV.md @@ -56,6 +56,10 @@ In your first PR, please add your name to the `LICENSE` file. ## Releasing a new version +This section describes the steps required to release a new version of PFERD. +Usually, they don't need to performed manually and `scripts/bump-version` can be +used instead. + 1. Update the version number in `PFERD/version.py` 2. Update `CHANGELOG.md` 3. Commit changes to `master` with message `Bump version to ` (e. g. `Bump version to 3.2.5`) diff --git a/scripts/bump-version b/scripts/bump-version new file mode 100755 index 0000000..4479ef8 --- /dev/null +++ b/scripts/bump-version @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +import argparse +import time +import re +from subprocess import run + + +def load_changelog(): + with open("CHANGELOG.md") as f: + return list(f) + + +def extract_changes(lines): + lines = iter(lines) + changes = [] + + # Find "Unreleased" section + for line in lines: + if line.strip() == "## Unreleased": + break + next(lines) + + # Read all lines from that section + for line in lines: + if line.startswith("## "): + # Found the beginning of the next section + break + elif line.startswith("### "): + # Found a heading in the current section + # Remove "#" symbols so git doesn't interpret the line as a comment later + changes.append(line[4:]) + else: + changes.append(line) + + # Remove trailing empty lines + while changes and not changes[-1].strip(): + changes.pop() + + return changes + + +def update_version(version): + with open("PFERD/version.py") as f: + text = f.read() + + text = re.sub(r'VERSION = ".*"', f'VERSION = "{version}"', text) + + with open("PFERD/version.py", "w") as f: + f.write(text) + + +def update_changelog(lines, version, date): + lines = iter(lines) + new_lines = [] + + # Find "Unreleased" section + for line in lines: + new_lines.append(line) + if 
line.strip() == "## Unreleased": + break + + # Add new heading below that + new_lines.append("\n") + new_lines.append(f"## {version} - {date}\n") + + # Add remaining lines + for line in lines: + new_lines.append(line) + + with open("CHANGELOG.md", "w") as f: + f.write("".join(new_lines)) + + +def commit_changes(version): + run(["git", "add", "CHANGELOG.md", "PFERD/version.py"]) + run(["git", "commit", "-m", f"Bump version to {version}"]) + + +def create_tag(version, annotation): + run(["git", "tag", "-am", annotation, f"v{version}"]) + + +def fastforward_latest(): + run(["git", "branch", "-f", "latest", "HEAD"]) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("version") + args = parser.parse_args() + + version = args.version + date = time.strftime("%Y-%m-%d") + changelog = load_changelog() + changes = extract_changes(changelog) + annotation = f"Version {version} - {date}\n\n{''.join(changes)}" + + update_version(version) + update_changelog(changelog, version, date) + commit_changes(version) + create_tag(version, annotation) + fastforward_latest() + + print() + print("Now the only thing left is to publish the changes:") + print(f" $ git push origin master latest v{version}") + + +if __name__ == "__main__": + main() From 69cb2a77340bf1dc0dbce3967876be51e9917ebc Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 11:33:45 +0200 Subject: [PATCH 241/524] Add Links option to ilias crawler This allows you to configure what type the link files should have and whether to create them at all. 
--- PFERD/crawl/ilias/file_templates.py | 40 ++++++++++++++++++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 29 +++++++++++++--- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index e9e332e..1385ba4 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,5 +1,8 @@ -link_template_plain = "{{link}}" -link_template_rich = """ +from enum import Enum +from typing import Optional + +_link_template_plain = "{{link}}" +_link_template_fancy = """ @@ -84,4 +87,35 @@ link_template_rich = """ -""" # noqa: E501 line too long +""".strip() # noqa: E501 line too long + +_link_template_internet_shortcut = """ +[InternetShortcut] +URL={{link}} +""".strip() + + +class Links(Enum): + IGNORE = "ignore" + PLAIN = "plain" + FANCY = "fancy" + INTERNET_SHORTCUT = "internet-shortcut" + + def template(self) -> Optional[str]: + if self == self.FANCY: + return _link_template_fancy + elif self == self.PLAIN: + return _link_template_plain + elif self == self.INTERNET_SHORTCUT: + return _link_template_internet_shortcut + elif self == self.IGNORE: + return None + raise ValueError("Missing switch case") + + @staticmethod + def from_string(string: str) -> "Links": + try: + return Links(string) + except ValueError: + raise ValueError("must be one of 'ignore', 'plain'," + " 'html', 'internet-shortcut'") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 0bd3202..283289e 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -14,7 +14,7 @@ from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import CrawlError, CrawlWarning, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import link_template_plain, link_template_rich +from .file_templates import 
Links from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names TargetType = Union[str, int] @@ -52,8 +52,16 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): def link_file_redirect_delay(self) -> int: return self.s.getint("link_file_redirect_delay", fallback=-1) - def link_file_use_plaintext(self) -> bool: - return self.s.getboolean("link_file_plaintext", fallback=False) + def links(self) -> Links: + type_str: Optional[str] = self.s.get("links") + + if type_str is None: + return Links.FANCY + + try: + return Links.from_string(type_str) + except ValueError as e: + self.invalid_value("links", type_str, str(e).capitalize()) def videos(self) -> bool: return self.s.getboolean("videos", fallback=False) @@ -166,7 +174,7 @@ class KitIliasWebCrawler(HttpCrawler): self._target = section.target() self._link_file_redirect_delay = section.link_file_redirect_delay() - self._link_file_use_plaintext = section.link_file_use_plaintext() + self._links = section.links() self._videos = section.videos() async def _run(self) -> None: @@ -292,6 +300,17 @@ class KitIliasWebCrawler(HttpCrawler): raise CrawlWarning(f"Unknown element type: {element.type!r}") async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: + log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") + log.explain(f"Links type is {self._links}") + + link_template_maybe = self._links.template() + if not link_template_maybe: + log.explain("Answer: No") + return + else: + log.explain("Answer: Yes") + link_template = link_template_maybe + maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: return @@ -303,7 +322,7 @@ class KitIliasWebCrawler(HttpCrawler): export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") real_url = await self._resolve_link_target(export_url) - content = link_template_plain if self._link_file_use_plaintext else link_template_rich + content = link_template content 
= content.replace("{{link}}", real_url) content = content.replace("{{name}}", element.name) content = content.replace("{{description}}", str(element.description)) From ffda4e43df80961990b65bf99c9a96f66f7566b3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 11:40:41 +0200 Subject: [PATCH 242/524] Add extension to link files --- PFERD/crawl/ilias/file_templates.py | 11 +++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 1385ba4..6f2b1cd 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -112,6 +112,17 @@ class Links(Enum): return None raise ValueError("Missing switch case") + def extension(self) -> Optional[str]: + if self == self.FANCY: + return ".html" + elif self == self.PLAIN: + return ".txt" + elif self == self.INTERNET_SHORTCUT: + return ".url" + elif self == self.IGNORE: + return None + raise ValueError("Missing switch case") + @staticmethod def from_string(string: str) -> "Links": try: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 283289e..318fa5e 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -304,12 +304,14 @@ class KitIliasWebCrawler(HttpCrawler): log.explain(f"Links type is {self._links}") link_template_maybe = self._links.template() - if not link_template_maybe: + link_extension = self._links.extension() + if not link_template_maybe or not link_extension: log.explain("Answer: No") return else: log.explain("Answer: Yes") link_template = link_template_maybe + element_path = element_path.with_name(element_path.name + link_extension) maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: From 4fefb98d719c0395a48c766b5833ed8baef913f8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 
2021 11:57:59 +0200 Subject: [PATCH 243/524] Add a wrapper to pretty-print ValueErrors in argparse parsers --- PFERD/cli/parser.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index 72abb76..f26390c 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -1,7 +1,8 @@ import argparse import configparser +from argparse import ArgumentTypeError from pathlib import Path -from typing import Any, List, Optional, Sequence, Union +from typing import Any, Callable, List, Optional, Sequence, Union from ..output_dir import OnConflict, Redownload from ..version import NAME, VERSION @@ -57,6 +58,19 @@ class BooleanOptionalAction(argparse.Action): return "--[no-]" + self.name[2:] +def show_value_error(inner: Callable[[str], Any]) -> Callable[[str], Any]: + """ + Some validation functions (like the from_string in our enums) raise a ValueError. + Argparse only pretty-prints ArgumentTypeErrors though, so we need to wrap our ValueErrors. 
+ """ + def wrapper(input: str) -> Any: + try: + return inner(input) + except ValueError as e: + raise ArgumentTypeError(e) + return wrapper + + CRAWLER_PARSER = argparse.ArgumentParser(add_help=False) CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( title="general crawler arguments", @@ -64,13 +78,13 @@ CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( ) CRAWLER_PARSER_GROUP.add_argument( "--redownload", - type=Redownload.from_string, + type=show_value_error(Redownload.from_string), metavar="OPTION", help="when to redownload a file that's already present locally" ) CRAWLER_PARSER_GROUP.add_argument( "--on-conflict", - type=OnConflict.from_string, + type=show_value_error(OnConflict.from_string), metavar="OPTION", help="what to do when local and remote files or directories differ" ) From c21ddf225b4da61787c651df39b1bb90e6815b51 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 11:58:41 +0200 Subject: [PATCH 244/524] Add a CLI option to configure ILIAS links behaviour --- PFERD/cli/command_kit_ilias_web.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index 89da390..8323c5c 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -2,7 +2,8 @@ import argparse import configparser from pathlib import Path -from .parser import CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, load_crawler +from ..crawl.ilias.file_templates import Links +from .parser import CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, load_crawler, show_value_error SUBPARSER = SUBPARSERS.add_parser( "kit-ilias-web", @@ -41,17 +42,18 @@ GROUP.add_argument( action=BooleanOptionalAction, help="use the system keyring to store and retrieve passwords" ) +GROUP.add_argument( + "--links", + type=show_value_error(Links.from_string), + metavar="OPTION", + help="how to treat external links" +) GROUP.add_argument( 
"--link-file-redirect-delay", type=int, metavar="SECONDS", help="delay before external link files redirect you to their target (-1 to disable)" ) -GROUP.add_argument( - "--link-file-plaintext", - action=BooleanOptionalAction, - help="use plain text files for external links" -) GROUP.add_argument( "--http-timeout", type=float, @@ -74,8 +76,8 @@ def load( section["auth"] = "auth:kit-ilias-web" if args.link_file_redirect_delay is not None: section["link_file_redirect_delay"] = str(args.link_file_redirect_delay) - if args.link_file_plaintext is not None: - section["link_file_plaintext"] = str(args.link_file_plaintext) + if args.links is not None: + section["links"] = str(args.links.value) if args.videos is not None: section["videos"] = str(False) if args.http_timeout is not None: From bce3dc384d82763f0836c5c236b930fb9d8ce75d Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 11:58:01 +0200 Subject: [PATCH 245/524] Deduplicate path names in crawler Also rename files so they follow the restrictions for windows file names if we're on windows. --- CONFIG.md | 2 + PFERD/crawl/crawler.py | 18 ++++++++ PFERD/crawl/local_crawler.py | 2 +- PFERD/deduplicator.py | 79 ++++++++++++++++++++++++++++++++++++ PFERD/output_dir.py | 12 +++++- PFERD/report.py | 3 ++ 6 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 PFERD/deduplicator.py diff --git a/CONFIG.md b/CONFIG.md index 7e8a717..982f4fc 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -83,6 +83,8 @@ crawlers: - `delay_between_tasks`: Time (in seconds) that the crawler should wait between subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary load for the crawl target. (Default: 0.0) +- `windows_paths`: Whether PFERD should find alternative names for paths that + are invalid on Windows. (Default: `yes` on Windows, `no` otherwise) Some crawlers may also require credentials for authentication. To configure how the crawler obtains its credentials, the `auth` option is used. 
It is set to the diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index d6d4abc..8bd29ad 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -1,4 +1,5 @@ import asyncio +import os from abc import ABC, abstractmethod from datetime import datetime from pathlib import Path, PurePath @@ -8,6 +9,7 @@ from rich.markup import escape from ..auth import Authenticator from ..config import Config, Section +from ..deduplicator import Deduplicator from ..limiter import Limiter from ..logging import ProgressBar, log from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload @@ -97,6 +99,10 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]): self._limiter = limiter self._path = path + @property + def path(self) -> PurePath: + return self._path + async def _on_aenter(self) -> ProgressBar: bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}" after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}" @@ -116,6 +122,10 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]): self._fs_token = fs_token self._path = path + @property + def path(self) -> PurePath: + return self._path + async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]: bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}" # The "Downloaded ..." 
message is printed in the output dir, not here @@ -195,6 +205,10 @@ class CrawlerSection(Section): self.invalid_value("auth", value, "No such auth section exists") return auth + def windows_paths(self) -> bool: + on_windows = os.name == "nt" + return self.s.getboolean("windows_paths", fallback=on_windows) + class Crawler(ABC): def __init__( @@ -221,12 +235,14 @@ class Crawler(ABC): task_delay=section.delay_between_tasks(), ) + self._deduplicator = Deduplicator(section.windows_paths()) self._transformer = Transformer(section.transform()) self._output_dir = OutputDirectory( config.default_section.working_dir() / section.output_dir(name), section.redownload(), section.on_conflict(), + section.windows_paths(), ) @property @@ -258,6 +274,7 @@ class Crawler(ABC): async def crawl(self, path: PurePath) -> Optional[CrawlToken]: log.explain_topic(f"Decision: Crawl {fmt_path(path)}") + path = self._deduplicator.mark(path) if self._transformer.transform(path) is None: log.explain("Answer: No") @@ -274,6 +291,7 @@ class Crawler(ABC): on_conflict: Optional[OnConflict] = None, ) -> Optional[DownloadToken]: log.explain_topic(f"Decision: Download {fmt_path(path)}") + path = self._deduplicator.mark(path) transformed_path = self._transformer.transform(path) if transformed_path is None: diff --git a/PFERD/crawl/local_crawler.py b/PFERD/crawl/local_crawler.py index 7958169..f102bc9 100644 --- a/PFERD/crawl/local_crawler.py +++ b/PFERD/crawl/local_crawler.py @@ -80,7 +80,7 @@ class LocalCrawler(Crawler): )) for child in path.iterdir(): - pure_child = pure / child.name + pure_child = cl.path / child.name tasks.append(self._crawl_path(child, pure_child)) await self.gather(tasks) diff --git a/PFERD/deduplicator.py b/PFERD/deduplicator.py new file mode 100644 index 0000000..ef62dcb --- /dev/null +++ b/PFERD/deduplicator.py @@ -0,0 +1,79 @@ +from pathlib import PurePath +from typing import Iterator, Set + +from .logging import log +from .utils import fmt_path + + +def name_variants(path: 
PurePath) -> Iterator[PurePath]: + separator = " " if " " in path.stem else "_" + i = 1 + while True: + yield path.parent / f"{path.stem}{separator}{i}{path.suffix}" + i += 1 + + +class Deduplicator: + FORBIDDEN_CHARS = '<>:"/\\|?*' + FORBIDDEN_NAMES = { + "CON", "PRN", "AUX", "NUL", + "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", + "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", + } + + def __init__(self, windows_paths: bool) -> None: + self._windows_paths = windows_paths + + self._known: Set[PurePath] = set() + + def _add(self, path: PurePath) -> None: + self._known.add(path) + + # The last parent is just "." + for parent in list(path.parents)[:-1]: + self._known.add(parent) + + def _fixup_element(self, name: str) -> str: + # For historical reasons, windows paths have some odd restrictions that + # we're trying to avoid. See: + # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file + + for char in self.FORBIDDEN_CHARS: + name = name.replace(char, "_") + + path = PurePath(name) + if path.stem in self.FORBIDDEN_NAMES: + name = f"{path.stem}_{path.suffix}" + + if name.endswith(" ") or name.endswith("."): + name += "_" + + return name + + def _fixup_for_windows(self, path: PurePath) -> PurePath: + new_path = PurePath(*[self._fixup_element(elem) for elem in path.parts]) + if new_path != path: + log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility") + return new_path + + def mark(self, path: PurePath) -> PurePath: + if self._windows_paths: + path = self._fixup_for_windows(path) + + if path not in self._known: + self._add(path) + return path + + log.explain(f"Path {fmt_path(path)} is already taken, finding a new name") + + for variant in name_variants(path): + if variant in self._known: + log.explain(f"Path {fmt_path(variant)} is taken as well") + continue + + log.explain(f"Found unused path {fmt_path(variant)}") + self._add(variant) + return variant + + # The "name_variants" 
iterator returns infinitely many paths + raise RuntimeError("Unreachable") diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 17cb772..5f65316 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -142,8 +142,17 @@ class OutputDirectory: root: Path, redownload: Redownload, on_conflict: OnConflict, + windows_paths: bool, ): - self._root = root + if windows_paths: + # Windows limits the path length to 260 for some historical reason + # If you want longer paths, you will have to add the "\\?\" prefix + # in front of your path. See: + # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation + self._root = Path("\\\\?\\" + str(root)) + else: + self._root = root + self._redownload = redownload self._on_conflict = on_conflict @@ -181,6 +190,7 @@ class OutputDirectory: raise OutputDirError(f"Forbidden segment '..' in path {fmt_path(path)}") if "." in path.parts: raise OutputDirError(f"Forbidden segment '.' in path {fmt_path(path)}") + return self._root / path def _should_download( diff --git a/PFERD/report.py b/PFERD/report.py index 619c621..4f15237 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -114,6 +114,9 @@ class Report: f.write("\n") # json.dump doesn't do this def mark_reserved(self, path: PurePath) -> None: + if path in self.marked: + raise RuntimeError("Trying to reserve an already reserved file") + self.reserved_files.add(path) def mark(self, path: PurePath) -> None: From 651b0879320500927a13f732b0bc070afbfa3ac2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 12:15:38 +0200 Subject: [PATCH 246/524] Use cl/dl deduplication mechanism for ILIAS crawler --- PFERD/crawl/ilias/kit_ilias_html.py | 52 ---------------------- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 6 +-- 2 files changed, 2 insertions(+), 56 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 43d66b5..032bb27 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ 
b/PFERD/crawl/ilias/kit_ilias_html.py @@ -567,55 +567,3 @@ def _tomorrow() -> date: def _sanitize_path_name(name: str) -> str: return name.replace("/", "-").replace("\\", "-").strip() - - -def deduplicate_element_names(elements: List[IliasPageElement]) -> List[IliasPageElement]: - """ - De-duplicates element names by appending an incrementing number to later elements: - test.pdf - test.pdf - would result in - test.pdf - test_1.pdf - - It is also space-aware: - "te st.pdf" - "te st.pdf" - would result in - "tes st.pdf" - "tes st 1.pdf" - """ - known_names = dict() - result_elements = [] - - for element in elements: - # This file is new - add it and mark its name as used - if element.name not in known_names: - known_names[element.name] = 1 - result_elements.append(element) - continue - - # This file is a duplicate. Find a suitable suffix - current_counter = known_names[element.name] - adjusted_element = _append_number(element, current_counter) - # increment the counter so the next duplicate does not conflict - known_names[element.name] += 1 - # also block the new name, so another file with the *renamed* name gets renamed as well - known_names[adjusted_element.name] = 1 - - result_elements.append(adjusted_element) - - return result_elements - - -def _append_number(element: IliasPageElement, number: int) -> IliasPageElement: - extension_index = element.name.rfind(".") - suffix = f" {number}" if " " in element.name else f"_{number}" - if extension_index < 0: - new_name = element.name + suffix - else: - new_name = element.name[:extension_index] + suffix + element.name[extension_index:] - - return IliasPageElement( - element.type, element.url, new_name, element.mtime, element.description - ) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 318fa5e..daafc12 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -15,7 +15,7 @@ from ...utils import fmt_path, 
soupify, url_set_query_param from ..crawler import CrawlError, CrawlWarning, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links -from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names +from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement TargetType = Union[str, int] @@ -226,7 +226,6 @@ class KitIliasWebCrawler(HttpCrawler): # Fill up our task list with the found elements await gather_elements() - elements = deduplicate_element_names(elements) tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements] # And execute them @@ -253,8 +252,7 @@ class KitIliasWebCrawler(HttpCrawler): # Fill up our task list with the found elements await gather_elements() - elements = deduplicate_element_names(elements) - tasks = [self._handle_ilias_element(path, element) for element in elements] + tasks = [self._handle_ilias_element(cl.path, element) for element in elements] # And execute them await self.gather(tasks) From 16d50b6626a1727edebdf8b2d309ce4b5ab702e9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 13:31:29 +0200 Subject: [PATCH 247/524] Document why /pferd.py exists --- pferd.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pferd.py b/pferd.py index 21857f4..dfea7c2 100644 --- a/pferd.py +++ b/pferd.py @@ -1,3 +1,5 @@ +# File used by pyinstaller to create the executable + from PFERD.__main__ import main if __name__ == "__main__": From c15a1aecdfd6a12c80e243c5c12588845de3dea0 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 13:32:00 +0200 Subject: [PATCH 248/524] Rename keyring authenticator file for consistency --- PFERD/auth/__init__.py | 2 +- PFERD/auth/{keyring_authenticator.py => keyring.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename PFERD/auth/{keyring_authenticator.py => keyring.py} (100%) diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 81ec31d..04ad587 100644 
--- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -3,7 +3,7 @@ from typing import Callable, Dict from ..config import Config from .authenticator import Authenticator, AuthSection -from .keyring_authenticator import KeyringAuthenticator, KeyringAuthSection +from .keyring import KeyringAuthenticator, KeyringAuthSection from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator diff --git a/PFERD/auth/keyring_authenticator.py b/PFERD/auth/keyring.py similarity index 100% rename from PFERD/auth/keyring_authenticator.py rename to PFERD/auth/keyring.py From 22c2259adbf25bfc26bb312f1f91380f1a5461da Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 14:21:12 +0200 Subject: [PATCH 249/524] Clean up authenticator exceptions - Renamed to *Error for consistency - Treating AuthError like CrawlError --- PFERD/auth/__init__.py | 2 +- PFERD/auth/authenticator.py | 12 ++++++------ PFERD/auth/keyring.py | 4 ++-- PFERD/auth/simple.py | 8 ++++---- PFERD/auth/tfa.py | 8 ++++---- PFERD/pferd.py | 4 ++-- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 04ad587..af38859 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -2,7 +2,7 @@ from configparser import SectionProxy from typing import Callable, Dict from ..config import Config -from .authenticator import Authenticator, AuthSection +from .authenticator import Authenticator, AuthError, AuthSection # noqa: F401 from .keyring import KeyringAuthenticator, KeyringAuthSection from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator diff --git a/PFERD/auth/authenticator.py b/PFERD/auth/authenticator.py index 9217dcd..5f09f92 100644 --- a/PFERD/auth/authenticator.py +++ b/PFERD/auth/authenticator.py @@ -4,11 +4,11 @@ from typing import Tuple from ..config import Config, Section -class AuthLoadException(Exception): +class AuthLoadError(Exception): pass -class 
AuthException(Exception): +class AuthError(Exception): pass @@ -30,7 +30,7 @@ class Authenticator(ABC): If you are writing your own constructor for your own authenticator, make sure to call this constructor first (via super().__init__). - May throw an AuthLoadException. + May throw an AuthLoadError. """ self.name = name @@ -56,7 +56,7 @@ class Authenticator(ABC): (e. g. prompting the user). """ - raise AuthException("Invalid credentials") + raise AuthError("Invalid credentials") def invalidate_username(self) -> None: """ @@ -67,7 +67,7 @@ class Authenticator(ABC): (e. g. prompting the user). """ - raise AuthException("Invalid username") + raise AuthError("Invalid username") def invalidate_password(self) -> None: """ @@ -78,4 +78,4 @@ class Authenticator(ABC): (e. g. prompting the user). """ - raise AuthException("Invalid password") + raise AuthError("Invalid password") diff --git a/PFERD/auth/keyring.py b/PFERD/auth/keyring.py index 413c7ad..326f629 100644 --- a/PFERD/auth/keyring.py +++ b/PFERD/auth/keyring.py @@ -6,7 +6,7 @@ from ..config import Config from ..logging import log from ..utils import agetpass from ..version import NAME -from .authenticator import Authenticator, AuthException, AuthSection +from .authenticator import Authenticator, AuthError, AuthSection class KeyringAuthSection(AuthSection): @@ -53,4 +53,4 @@ class KeyringAuthenticator(Authenticator): self.invalidate_password() def invalidate_password(self) -> None: - raise AuthException("Invalid password") + raise AuthError("Invalid password") diff --git a/PFERD/auth/simple.py b/PFERD/auth/simple.py index a12c359..7fbb60b 100644 --- a/PFERD/auth/simple.py +++ b/PFERD/auth/simple.py @@ -3,7 +3,7 @@ from typing import Optional, Tuple from ..config import Config from ..logging import log from ..utils import agetpass, ainput -from .authenticator import Authenticator, AuthException, AuthSection +from .authenticator import Authenticator, AuthError, AuthSection class SimpleAuthSection(AuthSection): @@ 
-48,7 +48,7 @@ class SimpleAuthenticator(Authenticator): def invalidate_credentials(self) -> None: if self._username_fixed and self._password_fixed: - raise AuthException("Configured credentials are invalid") + raise AuthError("Configured credentials are invalid") if not self._username_fixed: self._username = None @@ -57,12 +57,12 @@ class SimpleAuthenticator(Authenticator): def invalidate_username(self) -> None: if self._username_fixed: - raise AuthException("Configured username is invalid") + raise AuthError("Configured username is invalid") else: self._username = None def invalidate_password(self) -> None: if self._password_fixed: - raise AuthException("Configured password is invalid") + raise AuthError("Configured password is invalid") else: self._password = None diff --git a/PFERD/auth/tfa.py b/PFERD/auth/tfa.py index 670626d..3efabe1 100644 --- a/PFERD/auth/tfa.py +++ b/PFERD/auth/tfa.py @@ -3,7 +3,7 @@ from typing import Tuple from ..config import Config from ..logging import log from ..utils import ainput -from .authenticator import Authenticator, AuthException, AuthSection +from .authenticator import Authenticator, AuthError, AuthSection class TfaAuthenticator(Authenticator): @@ -16,7 +16,7 @@ class TfaAuthenticator(Authenticator): super().__init__(name, section, config) async def username(self) -> str: - raise AuthException("TFA authenticator does not support usernames") + raise AuthError("TFA authenticator does not support usernames") async def password(self) -> str: async with log.exclusive_output(): @@ -24,10 +24,10 @@ class TfaAuthenticator(Authenticator): return code async def credentials(self) -> Tuple[str, str]: - raise AuthException("TFA authenticator does not support usernames") + raise AuthError("TFA authenticator does not support usernames") def invalidate_username(self) -> None: - raise AuthException("TFA authenticator does not support usernames") + raise AuthError("TFA authenticator does not support usernames") def invalidate_password(self) 
-> None: pass diff --git a/PFERD/pferd.py b/PFERD/pferd.py index bed7c66..dbb8983 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -3,7 +3,7 @@ from typing import Dict, List, Optional from rich.markup import escape -from .auth import AUTHENTICATORS, Authenticator +from .auth import AUTHENTICATORS, Authenticator, AuthError from .config import Config, ConfigOptionError from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler from .logging import log @@ -117,7 +117,7 @@ class Pferd: try: await crawler.run() - except CrawlError as e: + except (CrawlError, AuthError) as e: log.error(str(e)) except Exception: log.unexpected_exception() From eb8b91581386f4d9dbd14b685c875f3376b29162 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 14:21:52 +0200 Subject: [PATCH 250/524] Fix path prefix on windows Previously, the path prefix was only set if "windows_paths" was true, regardless of OS. Now the path prefix is always set on windows and never set on other OSes. --- PFERD/crawl/crawler.py | 1 - PFERD/output_dir.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 8bd29ad..420d088 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -242,7 +242,6 @@ class Crawler(ABC): config.default_section.working_dir() / section.output_dir(name), section.redownload(), section.on_conflict(), - section.windows_paths(), ) @property diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 5f65316..fa7babe 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -142,10 +142,9 @@ class OutputDirectory: root: Path, redownload: Redownload, on_conflict: OnConflict, - windows_paths: bool, ): - if windows_paths: - # Windows limits the path length to 260 for some historical reason + if os.name == "nt": + # Windows limits the path length to 260 for some historical reason. # If you want longer paths, you will have to add the "\\?\" prefix # in front of your path. 
See: # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation From 61430c8739b5789596a7b8da085eca3b37b3ec83 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 14:12:19 +0200 Subject: [PATCH 251/524] Overhaul config and CLI option names --- CONFIG.md | 138 ++++++++++++--------- PFERD/cli/command_kit_ilias_web.py | 53 ++++---- PFERD/cli/parser.py | 35 ++++-- PFERD/crawl/crawler.py | 40 +++--- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 17 ++- 5 files changed, 154 insertions(+), 129 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 982f4fc..2f2dbbe 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -1,10 +1,11 @@ # Config file format A config file consists of sections. A section begins with a `[section]` header, -which is followed by a list of `key = value` or `key: value` pairs. Comments -must be on their own line and start with `#` or `;`. Multiline values must be -indented beyond their key. For more details and some examples on the format, see -the [configparser documentation][1] ([basic interpolation][2] is enabled). +which is followed by a list of `key = value` pairs. Comments must be on their +own line and start with `#`. Multiline values must be indented beyond their key. +Boolean values can be `yes` or `no`. For more details and some examples on the +format, see the [configparser documentation][1] ([basic interpolation][2] is +enabled). [1]: "Supported INI File Structure" [2]: "BasicInterpolation" @@ -15,21 +16,19 @@ This section contains global configuration values. It can also be used to set default values for the other sections. - `working_dir`: The directory PFERD operates in. Set to an absolute path to - make PFERD operate the same regardless of where it is executed. All other + make PFERD operate the same regardless of where it is executed from. All other paths in the config file are interpreted relative to this path. If this path is relative, it is interpreted relative to the script's working dir. 
`~` is expanded to the current user's home directory. (Default: `.`) - `explain`: Whether PFERD should log and explain its actions and decisions in detail. (Default: `no`) -- `status`: Whether PFERD should print status updates while crawling. (Default: - `yes`) +- `status`: Whether PFERD should print status updates (like `Crawled ...`, + `Added ...`) while running a crawler. (Default: `yes`) - `report`: Whether PFERD should print a report of added, changed and deleted local files for all crawlers before exiting. (Default: `yes`) -- `share_cookies`: Whether crawlers should share cookies where applicable. By - default, crawlers are isolated and don't interact with each other. This - includes their cookies. However, in situations where multiple crawlers crawl - the same website using the same account, sharing cookies between crawlers can - make sense. (Default: `yes`) +- `share_cookies`: Whether crawlers should share cookies where applicable. For + example, some crawlers share cookies if they crawl the same website using the + same account. (Default: `yes`) ## The `crawl:*` sections @@ -42,17 +41,17 @@ courses or lecture websites. Each crawl section represents an instance of a specific type of crawler. The `type` option is used to specify the crawler type. The crawler's name is usually -used as the name for the output directory. New crawlers can be created simply by -adding a new crawl section to the config file. +used as the output directory. New crawlers can be created simply by adding a new +crawl section to the config file. Depending on a crawler's type, it may have different options. For more details, -see the type's documentation below. The following options are common to all -crawlers: +see the type's [documentation](#crawler-types) below. The following options are +common to all crawlers: -- `type`: The types are specified in [this section](#crawler-types). +- `type`: The available types are specified in [this section](#crawler-types). 
- `output_dir`: The directory the crawler synchronizes files to. A crawler will never place any files outside of this directory. (Default: the crawler's name) -- `redownload`: When to download again a file that is already present locally. +- `redownload`: When to download a file that is already present locally. (Default: `never-smart`) - `never`: If a file is present locally, it is not downloaded again. - `never-smart`: Like `never`, but PFERD tries to detect if an already @@ -62,8 +61,8 @@ crawlers: - `always-smart`: Like `always`, but PFERD tries to avoid unnecessary downloads via some (unreliable) heuristics. - `on_conflict`: What to do when the local and remote versions of a file or - directory differ. Includes the cases where a file is replaced by a directory - or a directory by a file. (Default: `prompt`) + directory differ, including when a file is replaced by a directory or a + directory by a file. (Default: `prompt`) - `prompt`: Always ask the user before overwriting or deleting local files and directories. - `local-first`: Always keep the local file or directory. Equivalent to @@ -75,14 +74,13 @@ crawlers: remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) -- `max_concurrent_tasks`: The maximum number of concurrent tasks (such as - crawling or downloading). (Default: 1) -- `max_concurrent_downloads`: How many of those tasks can be download tasks at - the same time. Must not be greater than `max_concurrent_tasks`. When not set, - this is the same as `max_concurrent_tasks`. (Optional) -- `delay_between_tasks`: Time (in seconds) that the crawler should wait between +- `tasks`: The maximum number of concurrent tasks (such as crawling or + downloading). (Default: `1`) +- `downloads`: How many of those tasks can be download tasks at the same time. + Must not be greater than `tasks`. 
(Default: Same as `tasks`) +- `task_delay`: Time (in seconds) that the crawler should wait between subsequent tasks. Can be used as a sort of rate limit to avoid unnecessary - load for the crawl target. (Default: 0.0) + load for the crawl target. (Default: `0.0`) - `windows_paths`: Whether PFERD should find alternative names for paths that are invalid on Windows. (Default: `yes` on Windows, `no` otherwise) @@ -101,6 +99,8 @@ password = bar [crawl:something] type = some-complex-crawler auth = auth:example +on_conflict = no-delete +tasks = 3 ``` ## The `auth:*` sections @@ -109,12 +109,12 @@ Sections whose names start with `auth:` are used to configure authenticators. An authenticator provides a username and a password to one or more crawlers. Authenticators work similar to crawlers: A section represents an authenticator -instance, whose name is the rest of the section name. The type is specified by +instance whose name is the rest of the section name. The type is specified by the `type` option. Depending on an authenticator's type, it may have different options. For more -details, see the type's documentation below. The only option common to all -authenticators is `type`: +details, see the type's [documentation](#authenticator-types) below. The only +option common to all authenticators is `type`: - `type`: The types are specified in [this section](#authenticator-types). @@ -127,28 +127,47 @@ testing different setups. The various delay options are meant to make the crawler simulate a slower, network-based crawler. - `target`: Path to the local directory to crawl. (Required) -- `crawl_delay`: Maximum artificial delay (in seconds) to simulate for crawl - requests. (Default: 0.0) -- `download_delay`: Maximum artificial delay (in seconds) to simulate for - download requests. (Default: 0.0) +- `crawl_delay`: Artificial delay (in seconds) to simulate for crawl requests. 
+ (Default: `0.0`) +- `download_delay`: Artificial delay (in seconds) to simulate for download + requests. (Default: `0.0`) - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) -### The `kit-ilias` crawler +### The `kit-ilias-web` crawler -This crawler crawls the KIT ILIAS instance. It performs remote calls to a poor SCC-Server, so you should be nice and use reasonable delays and concurrent requests. -- `target`: The ILIAS element to crawl. Can be: - - `desktop` if you want to crawl your personal desktop - - `` if you want to crawl the course with the given id - - `` if you want to crawl a given element by URL (preferably the permanent URL linked at the bottom of an ILIAS page) -- `tfa_auth`: Like `auth` but only used for two-factor authentication -- `link_file_redirect_delay`: PFERD will create local HTML for external links. - If this property is set to a non-negative value it configures the amount of seconds after which the local HTML - file will redirect you to the link target. -- `link_file_plain_text`: If this is set to true, PFERD will generate plain-text files containing only the link - target for external links. If this is false or not specified, PFERD will generate a neat, pretty and functional - HTML page instead. -- `videos`: If this is set to false, PFERD will not crawl or download any videos. -- `http_timeout`: The timeout for http requests +This crawler crawls the KIT ILIAS instance. + +ILIAS is not great at handling too many concurrent requests. To avoid +unnecessary load, please limit `tasks` to `1`. + +There is a spike in ILIAS usage at the beginning of lectures, so please don't +run PFERD during those times. + +If you're automatically running PFERD periodically (e. g. via cron or a systemd +timer), please randomize the start time or at least don't use the full hour. For +systemd timers, this can be accomplished using the `RandomizedDelaySec` option. +Also, please schedule the script to run in periods of low activity. 
Running the +script once per day should be fine. + +- `target`: The ILIAS element to crawl. (Required) + - `desktop`: Crawl your personal desktop + - ``: Crawl the course with the given id + - ``: Crawl a given element by URL (preferably the permanent URL linked + at the bottom of its ILIAS page) +- `auth`: Name of auth section to use for login. (Required) +- `tfa_auth`: Name of auth section to use for two-factor authentication. Only + uses the auth section's password. (Default: Anonymous `tfa` authenticator) +- `links`: How to represent external links. (Default: `fancy`) + - `ignore`: Don't download links. + - `plaintext`: A text file containing only the URL. + - `fancy`: A HTML file looking like the ILIAS link element. + - `internet-shortcut`: An internet shortcut file (`.url` file). +- `link_redirect_delay`: Time (in seconds) until `fancy` link files will + redirect to the actual URL. Set to a negative value to disable the automatic + redirect. (Default: `-1`) +- `videos`: Whether to download videos. (Default: `no`) +- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: + `20.0`) ## Authenticator types @@ -161,21 +180,24 @@ via the terminal. - `username`: The username. (Optional) - `password`: The password. (Optional) +### The `keyring` authenticator + +This authenticator uses the system keyring to store passwords. The username can +be set directly in the config file. If the username is not specified, the user +is prompted via the terminal. If the keyring contains no entry or the entry is +incorrect, the user is prompted for a password via the terminal and the password +is stored in the keyring. + +- `username`: The username. (Optional) +- `keyring_name`: The service name PFERD uses for storing credentials. (Default: + `PFERD`) + ### The `tfa` authenticator This authenticator prompts the user on the console for a two-factor authentication token. The token is provided as password and it is not cached. 
This authenticator does not support usernames. -### The `keyring` authenticator - -This authenticator uses the system keyring to store passwords. It expects a -username in the config and will prompt *once* for the password. After that it -receives the password from the system keyring. - -- `username`: The username. (Required) -- `keyring_name`: The service name PFERD uses for storing credentials. (Optional) - ## Transformation rules Transformation rules are rules for renaming and excluding files and directories. diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index 8323c5c..ccb7134 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -11,14 +11,14 @@ SUBPARSER = SUBPARSERS.add_parser( ) GROUP = SUBPARSER.add_argument_group( - title="KIT ILIAS web-crawler arguments", + title="kit-ilias-web crawler arguments", description="arguments for the 'kit-ilias-web' crawler", ) GROUP.add_argument( "target", type=str, metavar="TARGET", - help="course id, 'desktop', or ILIAS https-URL to crawl" + help="course id, 'desktop', or ILIAS URL to crawl" ) GROUP.add_argument( "output", @@ -27,14 +27,9 @@ GROUP.add_argument( help="output directory" ) GROUP.add_argument( - "--videos", - action=BooleanOptionalAction, - help="crawl and download videos" -) -GROUP.add_argument( - "--username", + "--username", "-u", type=str, - metavar="USER_NAME", + metavar="USERNAME", help="user name for authentication" ) GROUP.add_argument( @@ -46,19 +41,24 @@ GROUP.add_argument( "--links", type=show_value_error(Links.from_string), metavar="OPTION", - help="how to treat external links" + help="how to represent external links" ) GROUP.add_argument( - "--link-file-redirect-delay", + "--link-redirect-delay", type=int, metavar="SECONDS", - help="delay before external link files redirect you to their target (-1 to disable)" + help="time before 'fancy' links redirect to to their target (-1 to disable)" ) GROUP.add_argument( - 
"--http-timeout", + "--videos", + action=BooleanOptionalAction, + help="crawl and download videos" +) +GROUP.add_argument( + "--http-timeout", "-t", type=float, metavar="SECONDS", - help="the timeout to use for HTTP requests" + help="timeout for all HTTP requests" ) @@ -66,33 +66,30 @@ def load( args: argparse.Namespace, parser: configparser.ConfigParser, ) -> None: - parser["crawl:kit-ilias-web"] = {} - section = parser["crawl:kit-ilias-web"] + parser["crawl:ilias"] = {} + section = parser["crawl:ilias"] load_crawler(args, section) section["type"] = "kit-ilias-web" section["target"] = str(args.target) section["output_dir"] = str(args.output) - section["auth"] = "auth:kit-ilias-web" - if args.link_file_redirect_delay is not None: - section["link_file_redirect_delay"] = str(args.link_file_redirect_delay) + section["auth"] = "auth:ilias" if args.links is not None: section["links"] = str(args.links.value) + if args.link_redirect_delay is not None: + section["link_redirect_delay"] = str(args.link_redirect_delay) if args.videos is not None: - section["videos"] = str(False) + section["videos"] = "yes" if args.videos else "no" if args.http_timeout is not None: section["http_timeout"] = str(args.http_timeout) - parser["auth:kit-ilias-web"] = {} - auth_section = parser["auth:kit-ilias-web"] - + parser["auth:ilias"] = {} + auth_section = parser["auth:ilias"] + auth_section["type"] = "simple" + if args.username is not None: + auth_section["username"] = args.username if args.keyring: auth_section["type"] = "keyring" - else: - auth_section["type"] = "simple" - - if args.username is not None: - auth_section["username"] = str(args.username) SUBPARSER.set_defaults(command=load) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index f26390c..4e3b425 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -77,10 +77,10 @@ CRAWLER_PARSER_GROUP = CRAWLER_PARSER.add_argument_group( description="arguments common to all crawlers", ) CRAWLER_PARSER_GROUP.add_argument( - 
"--redownload", + "--redownload", "-r", type=show_value_error(Redownload.from_string), metavar="OPTION", - help="when to redownload a file that's already present locally" + help="when to download a file that's already present locally" ) CRAWLER_PARSER_GROUP.add_argument( "--on-conflict", @@ -89,30 +89,35 @@ CRAWLER_PARSER_GROUP.add_argument( help="what to do when local and remote files or directories differ" ) CRAWLER_PARSER_GROUP.add_argument( - "--transform", "-t", + "--transform", "-T", action="append", type=str, metavar="RULE", help="add a single transformation rule. Can be specified multiple times" ) CRAWLER_PARSER_GROUP.add_argument( - "--max-concurrent-tasks", + "--tasks", "-n", type=int, metavar="N", help="maximum number of concurrent tasks (crawling, downloading)" ) CRAWLER_PARSER_GROUP.add_argument( - "--max-concurrent-downloads", + "--downloads", "-N", type=int, metavar="N", help="maximum number of tasks that may download data at the same time" ) CRAWLER_PARSER_GROUP.add_argument( - "--delay-between-tasks", + "--task-delay", "-d", type=float, metavar="SECONDS", help="time the crawler should wait between subsequent tasks" ) +CRAWLER_PARSER_GROUP.add_argument( + "--windows-paths", + action=BooleanOptionalAction, + help="whether to repair invalid paths on windows" +) def load_crawler( @@ -125,12 +130,14 @@ def load_crawler( section["on_conflict"] = args.on_conflict.value if args.transform is not None: section["transform"] = "\n" + "\n".join(args.transform) - if args.max_concurrent_tasks is not None: - section["max_concurrent_tasks"] = str(args.max_concurrent_tasks) - if args.max_concurrent_downloads is not None: - section["max_concurrent_downloads"] = str(args.max_concurrent_downloads) - if args.delay_between_tasks is not None: - section["delay_between_tasks"] = str(args.delay_between_tasks) + if args.tasks is not None: + section["tasks"] = str(args.tasks) + if args.downloads is not None: + section["downloads"] = str(args.downloads) + if args.task_delay is 
not None: + section["task_delay"] = str(args.task_delay) + if args.windows_paths is not None: + section["windows_paths"] = "yes" if args.windows_paths else "no" PARSER = argparse.ArgumentParser() @@ -200,6 +207,10 @@ def load_default_section( section["working_dir"] = str(args.working_dir) if args.explain is not None: section["explain"] = "yes" if args.explain else "no" + if args.status is not None: + section["status"] = "yes" if args.status else "no" + if args.report is not None: + section["report"] = "yes" if args.report else "no" if args.share_cookies is not None: section["share_cookies"] = "yes" if args.share_cookies else "no" diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 420d088..321daa2 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -169,33 +169,33 @@ class CrawlerSection(Section): def transform(self) -> str: return self.s.get("transform", "") - def max_concurrent_tasks(self) -> int: - value = self.s.getint("max_concurrent_tasks", fallback=1) + def tasks(self) -> int: + value = self.s.getint("tasks", fallback=1) if value <= 0: - self.invalid_value("max_concurrent_tasks", value, - "Must be greater than 0") + self.invalid_value("tasks", value, "Must be greater than 0") return value - def max_concurrent_downloads(self) -> int: - tasks = self.max_concurrent_tasks() - value = self.s.getint("max_concurrent_downloads", fallback=None) + def downloads(self) -> int: + tasks = self.tasks() + value = self.s.getint("downloads", fallback=None) if value is None: return tasks if value <= 0: - self.invalid_value("max_concurrent_downloads", value, - "Must be greater than 0") + self.invalid_value("downloads", value, "Must be greater than 0") if value > tasks: - self.invalid_value("max_concurrent_downloads", value, - "Must not be greater than max_concurrent_tasks") + self.invalid_value("downloads", value, "Must not be greater than tasks") return value - def delay_between_tasks(self) -> float: - value = 
self.s.getfloat("delay_between_tasks", fallback=0.0) + def task_delay(self) -> float: + value = self.s.getfloat("task_delay", fallback=0.0) if value < 0: - self.invalid_value("delay_between_tasks", value, - "Must not be negative") + self.invalid_value("task_delay", value, "Must not be negative") return value + def windows_paths(self) -> bool: + on_windows = os.name == "nt" + return self.s.getboolean("windows_paths", fallback=on_windows) + def auth(self, authenticators: Dict[str, Authenticator]) -> Authenticator: value = self.s.get("auth") if value is None: @@ -205,10 +205,6 @@ class CrawlerSection(Section): self.invalid_value("auth", value, "No such auth section exists") return auth - def windows_paths(self) -> bool: - on_windows = os.name == "nt" - return self.s.getboolean("windows_paths", fallback=on_windows) - class Crawler(ABC): def __init__( @@ -230,9 +226,9 @@ class Crawler(ABC): self.error_free = True self._limiter = Limiter( - task_limit=section.max_concurrent_tasks(), - download_limit=section.max_concurrent_downloads(), - task_delay=section.delay_between_tasks(), + task_limit=section.tasks(), + download_limit=section.downloads(), + task_delay=section.task_delay(), ) self._deduplicator = Deduplicator(section.windows_paths()) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index daafc12..40db52c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -40,18 +40,14 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): self.invalid_value("target", target, "Should be ") def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]: - value = self.s.get("tfa_auth") - if not value: + value: Optional[str] = self.s.get("tfa_auth") + if value is None: return None - - auth = authenticators.get(f"auth:{value}") + auth = authenticators.get(value) if auth is None: - self.invalid_value("auth", value, "No such auth section exists") + 
self.invalid_value("tfa_auth", value, "No such auth section exists") return auth - def link_file_redirect_delay(self) -> int: - return self.s.getint("link_file_redirect_delay", fallback=-1) - def links(self) -> Links: type_str: Optional[str] = self.s.get("links") @@ -63,6 +59,9 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): except ValueError as e: self.invalid_value("links", type_str, str(e).capitalize()) + def link_redirect_delay(self) -> int: + return self.s.getint("link_redirect_delay", fallback=-1) + def videos(self) -> bool: return self.s.getboolean("videos", fallback=False) @@ -173,7 +172,7 @@ class KitIliasWebCrawler(HttpCrawler): self._base_url = "https://ilias.studium.kit.edu" self._target = section.target() - self._link_file_redirect_delay = section.link_file_redirect_delay() + self._link_file_redirect_delay = section.link_redirect_delay() self._links = section.links() self._videos = section.videos() From d905e95dbb6368678c37c09a187a4e571a818c4c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 15:02:35 +0200 Subject: [PATCH 252/524] Allow invalidation of keyring authenticator --- PFERD/auth/keyring.py | 49 ++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/PFERD/auth/keyring.py b/PFERD/auth/keyring.py index 326f629..b63bf90 100644 --- a/PFERD/auth/keyring.py +++ b/PFERD/auth/keyring.py @@ -4,17 +4,14 @@ import keyring from ..config import Config from ..logging import log -from ..utils import agetpass +from ..utils import agetpass, ainput from ..version import NAME from .authenticator import Authenticator, AuthError, AuthSection class KeyringAuthSection(AuthSection): - def username(self) -> str: - name = self.s.get("username") - if name is None: - self.missing_value("username") - return name + def username(self) -> Optional[str]: + return self.s.get("username") def keyring_name(self) -> str: return self.s.get("keyring_name", fallback=NAME) @@ -34,23 +31,41 @@ class 
KeyringAuthenticator(Authenticator): self._password: Optional[str] = None self._keyring_name = section.keyring_name() + self._password_invalidated = False + self._username_fixed = section.username() is not None + async def credentials(self) -> Tuple[str, str]: - if self._password is not None: - return self._username, self._password - - password = keyring.get_password(self._keyring_name, self._username) - - if not password: + # Request the username + if self._username is None: async with log.exclusive_output(): - password = await agetpass("Password: ") - keyring.set_password(self._keyring_name, self._username, password) + self._username = await ainput("Username: ") - self._password = password + # First try looking it up in the keyring. + # Do not look it up if it was invalidated - we want to re-prompt in this case + if self._password is None and not self._password_invalidated: + self._password = keyring.get_password(self._keyring_name, self._username) - return self._username, password + # If that fails it wasn't saved in the keyring - we need to + # read it from the user and store it + if self._password is None: + async with log.exclusive_output(): + self._password = await agetpass("Password: ") + keyring.set_password(self._keyring_name, self._username, self._password) + + self._password_invalidated = False + return self._username, self._password def invalidate_credentials(self) -> None: + if not self._username_fixed: + self.invalidate_username() self.invalidate_password() + def invalidate_username(self) -> None: + if self._username_fixed: + raise AuthError("Configured username is invalid") + else: + self._username = None + def invalidate_password(self) -> None: - raise AuthError("Invalid password") + self._password = None + self._password_invalidated = True From 0096a0c07779d9ccd054d0dd1b98f045d2e6c13d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 15:11:33 +0200 Subject: [PATCH 253/524] Remove section and config parameter from Authenticator 
--- PFERD/auth/__init__.py | 6 +++--- PFERD/auth/authenticator.py | 6 ++---- PFERD/auth/keyring.py | 4 +--- PFERD/auth/simple.py | 4 +--- PFERD/auth/tfa.py | 7 ++----- 5 files changed, 9 insertions(+), 18 deletions(-) diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index af38859..6e7fd3a 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -15,9 +15,9 @@ AuthConstructor = Callable[[ AUTHENTICATORS: Dict[str, AuthConstructor] = { "simple": lambda n, s, c: - SimpleAuthenticator(n, SimpleAuthSection(s), c), + SimpleAuthenticator(n, SimpleAuthSection(s)), "tfa": lambda n, s, c: - TfaAuthenticator(n, AuthSection(s), c), + TfaAuthenticator(n), "keyring": lambda n, s, c: - KeyringAuthenticator(n, KeyringAuthSection(s), c) + KeyringAuthenticator(n, KeyringAuthSection(s)) } diff --git a/PFERD/auth/authenticator.py b/PFERD/auth/authenticator.py index 5f09f92..fe14909 100644 --- a/PFERD/auth/authenticator.py +++ b/PFERD/auth/authenticator.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from typing import Tuple -from ..config import Config, Section +from ..config import Section class AuthLoadError(Exception): @@ -19,9 +19,7 @@ class AuthSection(Section): class Authenticator(ABC): def __init__( self, - name: str, - section: AuthSection, - config: Config, + name: str ) -> None: """ Initialize an authenticator from its name and its section in the config diff --git a/PFERD/auth/keyring.py b/PFERD/auth/keyring.py index b63bf90..c7ca2c2 100644 --- a/PFERD/auth/keyring.py +++ b/PFERD/auth/keyring.py @@ -2,7 +2,6 @@ from typing import Optional, Tuple import keyring -from ..config import Config from ..logging import log from ..utils import agetpass, ainput from ..version import NAME @@ -23,9 +22,8 @@ class KeyringAuthenticator(Authenticator): self, name: str, section: KeyringAuthSection, - config: Config, ) -> None: - super().__init__(name, section, config) + super().__init__(name) self._username = section.username() self._password: Optional[str] = 
None diff --git a/PFERD/auth/simple.py b/PFERD/auth/simple.py index 7fbb60b..d2f4123 100644 --- a/PFERD/auth/simple.py +++ b/PFERD/auth/simple.py @@ -1,6 +1,5 @@ from typing import Optional, Tuple -from ..config import Config from ..logging import log from ..utils import agetpass, ainput from .authenticator import Authenticator, AuthError, AuthSection @@ -19,9 +18,8 @@ class SimpleAuthenticator(Authenticator): self, name: str, section: SimpleAuthSection, - config: Config, ) -> None: - super().__init__(name, section, config) + super().__init__(name) self._username = section.username() self._password = section.password() diff --git a/PFERD/auth/tfa.py b/PFERD/auth/tfa.py index 3efabe1..28ba150 100644 --- a/PFERD/auth/tfa.py +++ b/PFERD/auth/tfa.py @@ -1,19 +1,16 @@ from typing import Tuple -from ..config import Config from ..logging import log from ..utils import ainput -from .authenticator import Authenticator, AuthError, AuthSection +from .authenticator import Authenticator, AuthError class TfaAuthenticator(Authenticator): def __init__( self, name: str, - section: AuthSection, - config: Config, ) -> None: - super().__init__(name, section, config) + super().__init__(name) async def username(self) -> str: raise AuthError("TFA authenticator does not support usernames") From 486699cef3038e3cb00371383af22162e28631c4 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 15:11:52 +0200 Subject: [PATCH 254/524] Create anonymous TFA authenticator in ilias crawler This ensures that *some* TFA authenticator is always present when authenticating, even if none is specified in the config. The TfaAuthenticator does not depend on any configured values, so it can be created on-demand. 
--- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 40db52c..6013d77 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -7,7 +7,7 @@ import aiohttp from aiohttp import hdrs from bs4 import BeautifulSoup, Tag -from ...auth import Authenticator +from ...auth import Authenticator, TfaAuthenticator from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload @@ -523,7 +523,7 @@ class KitShibbolethLogin: soup: BeautifulSoup ) -> BeautifulSoup: if not self._tfa_auth: - raise RuntimeError("No 'tfa_auth' present but you use two-factor authentication!") + self._tfa_auth = TfaAuthenticator("ilias-anon-tfa") tfa_token = await self._tfa_auth.password() From 980578d05ae08103453a7683bad4312c56f5f7c5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 15:12:07 +0200 Subject: [PATCH 255/524] Avoid downloading in some cases Depending on how on_conflict is set, we can determine a few situations where downloading is never necessary. --- PFERD/output_dir.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index fa7babe..ea3a3e0 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -197,13 +197,30 @@ class OutputDirectory: local_path: Path, heuristics: Heuristics, redownload: Redownload, + on_conflict: OnConflict, ) -> bool: - # If we don't have a *file* at the local path, we'll always redownload - # since we know that the remote is different from the local files. This - # includes the case where no local file exists. 
- if not local_path.is_file(): + if not local_path.exists(): log.explain("No corresponding file present locally") - # TODO Don't download if on_conflict is LOCAL_FIRST or NO_DELETE + return True + + if on_conflict == OnConflict.LOCAL_FIRST: + # Whatever is here, it will never be overwritten, so we don't need + # to download the file. + log.explain("Conflict resolution is 'local-first' and path exists") + return False + + if not local_path.is_file(): + # We know that there is *something* here that's not a file. + log.explain("Non-file (probably a directory) present locally") + + # If on_conflict is LOCAL_FIRST or NO_DELETE, we know that it would + # never be overwritten. It also doesn't have any relevant stats to + # update. This means that we don't have to download the file + # because we'd just always throw it away again. + if on_conflict in {OnConflict.LOCAL_FIRST, OnConflict.NO_DELETE}: + log.explain(f"Conflict resolution is {on_conflict.value!r}") + return False + return True log.explain(f"Redownload policy is {redownload.value}") @@ -363,7 +380,7 @@ class OutputDirectory: self._report.mark(path) - if not self._should_download(local_path, heuristics, redownload): + if not self._should_download(local_path, heuristics, redownload, on_conflict): return None # Detect and solve local-dir-remote-file conflict From edb52a989eb61bac99800950d4a31e604ebd96ae Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 15:35:36 +0200 Subject: [PATCH 256/524] Print report even if exiting due to Ctrl+C --- PFERD/__main__.py | 22 +++++++++--- PFERD/config.py | 2 ++ PFERD/pferd.py | 92 +++++++++++++++++++++++------------------------ 3 files changed, 63 insertions(+), 53 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 0fbce59..5fd9447 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -107,15 +107,22 @@ def main() -> None: try: pferd = Pferd(config, args.crawler) - asyncio.run(pferd.run()) - except (PferdLoadError, ConfigOptionError) as e: + except 
PferdLoadError as e: log.unlock() log.error(str(e)) exit(1) + + error = False + try: + asyncio.run(pferd.run()) + except ConfigOptionError as e: + log.unlock() + log.error(str(e)) + error = True except RuleParseError as e: log.unlock() e.pretty_print() - exit(1) + error = True except KeyboardInterrupt: log.unlock() log.explain_topic("Interrupted, exiting immediately") @@ -123,9 +130,14 @@ def main() -> None: log.explain("Temporary files are not cleaned up") # TODO Clean up tmp files # And when those files *do* actually get cleaned up properly, - # reconsider what exit code to use here. - exit(1) + # reconsider if this should be an error + error = True except Exception: log.unlock() log.unexpected_exception() + error = True + + pferd.print_report() + + if error: exit(1) diff --git a/PFERD/config.py b/PFERD/config.py index 8293331..4bfada7 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -170,6 +170,7 @@ class Config: def dump_to_stdout(self) -> None: self._parser.write(sys.stdout) + # TODO Rename to "crawl_sections" def crawler_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] for name, proxy in self._parser.items(): @@ -178,6 +179,7 @@ class Config: return result + # TODO Rename to "auth_sections" def authenticator_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] for name, proxy in self._parser.items(): diff --git a/PFERD/pferd.py b/PFERD/pferd.py index dbb8983..c0b48a7 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -15,20 +15,53 @@ class PferdLoadError(Exception): class Pferd: - def __init__(self, config: Config, crawlers_to_run: Optional[List[str]]): + def __init__(self, config: Config, cli_crawlers: Optional[List[str]]): """ May throw PferdLoadError. 
""" - if crawlers_to_run is not None and len(crawlers_to_run) != len(set(crawlers_to_run)): - raise PferdLoadError("Some crawlers were selected multiple times") - self._config = config - self._crawlers_to_run = crawlers_to_run + self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers) self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} + def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]: + log.explain_topic("Deciding which crawlers to run") + crawl_sections = [name for name, _ in config.crawler_sections()] + + if cli_crawlers is None: + log.explain("No crawlers specified on CLI") + log.explain("Running all crawlers specified in config") + return crawl_sections + + if len(cli_crawlers) != len(set(cli_crawlers)): + raise PferdLoadError("Some crawlers were selected multiple times") + + log.explain("Crawlers specified on CLI") + + crawlers_to_run = [] # With crawl: prefix + unknown_names = [] # Without crawl: prefix + + for name in cli_crawlers: + section_name = f"crawl:{name}" + if section_name in crawl_sections: + log.explain(f"Crawler section named {section_name!r} exists") + crawlers_to_run.append(section_name) + else: + log.explain(f"There's no crawler section named {section_name!r}") + unknown_names.append(name) + + if unknown_names: + if len(unknown_names) == 1: + [name] = unknown_names + raise PferdLoadError(f"There is no crawler named {name!r}") + else: + names_str = ", ".join(repr(name) for name in unknown_names) + raise PferdLoadError(f"There are no crawlers named {names_str}") + + return crawlers_to_run + def _load_authenticators(self) -> None: for name, section in self._config.authenticator_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") @@ -40,15 +73,12 @@ class Pferd: authenticator = authenticator_constructor(name, section, self._config) self._authenticators[name] = authenticator - def _load_crawlers(self) -> List[str]: - names = [] 
- + def _load_crawlers(self) -> None: # Cookie sharing kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {} for name, section in self._config.crawler_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") - names.append(name) crawl_type = section.get("type") crawler_constructor = CRAWLERS.get(crawl_type) @@ -62,55 +92,20 @@ class Pferd: if isinstance(crawler, KitIliasWebCrawler): crawler.share_cookies(kit_ilias_web_paths) - return names - - def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]: - log.explain_topic("Deciding which crawlers to run") - - if self._crawlers_to_run is None: - log.explain("No crawlers specified on CLI") - log.explain("Running all loaded crawlers") - return loaded_crawlers - - log.explain("Crawlers specified on CLI") - - names: List[str] = [] # With 'crawl:' prefix - unknown_names = [] # Without 'crawl:' prefix - - for name in self._crawlers_to_run: - section_name = f"crawl:{name}" - if section_name in self._crawlers: - log.explain(f"Crawler section named {section_name!r} exists") - names.append(section_name) - else: - log.explain(f"There's no crawler section named {section_name!r}") - unknown_names.append(name) - - if unknown_names: - if len(unknown_names) == 1: - [name] = unknown_names - raise PferdLoadError(f"There is no crawler named {name!r}") - else: - names_str = ", ".join(repr(name) for name in unknown_names) - raise PferdLoadError(f"There are no crawlers named {names_str}") - - return names - async def run(self) -> None: """ - May throw PferdLoadError or ConfigOptionError. + May throw ConfigOptionError. """ # These two functions must run inside the same event loop as the # crawlers, so that any new objects (like Conditions or Futures) can # obtain the correct event loop. 
self._load_authenticators() - loaded_crawlers = self._load_crawlers() - names = self._find_crawlers_to_run(loaded_crawlers) + self._load_crawlers() log.print("") - for name in names: + for name in self._crawlers_to_run: crawler = self._crawlers[name] log.print(f"[bold bright_cyan]Running[/] {escape(name)}") @@ -122,7 +117,8 @@ class Pferd: except Exception: log.unexpected_exception() - for name in names: + def print_report(self) -> None: + for name in self._crawlers_to_run: crawler = self._crawlers[name] log.report("") From f68849c65f37d7fa3949466bdd67039ed67a07ae Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 15:42:46 +0200 Subject: [PATCH 257/524] Fix rules not being parsed entirely --- PFERD/__main__.py | 16 ++++++---------- PFERD/pferd.py | 4 +++- PFERD/transformer.py | 7 +++++++ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 5fd9447..b7c5fa9 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -112,32 +112,28 @@ def main() -> None: log.error(str(e)) exit(1) - error = False try: asyncio.run(pferd.run()) except ConfigOptionError as e: log.unlock() log.error(str(e)) - error = True + exit(1) except RuleParseError as e: log.unlock() e.pretty_print() - error = True + exit(1) except KeyboardInterrupt: log.unlock() log.explain_topic("Interrupted, exiting immediately") log.explain("Open files and connections are left for the OS to clean up") log.explain("Temporary files are not cleaned up") + pferd.print_report() # TODO Clean up tmp files # And when those files *do* actually get cleaned up properly, - # reconsider if this should be an error - error = True + # reconsider if this should really exit with 1 + exit(1) except Exception: log.unlock() log.unexpected_exception() - error = True - - pferd.print_report() - - if error: + pferd.print_report() exit(1) diff --git a/PFERD/pferd.py b/PFERD/pferd.py index c0b48a7..434407d 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -119,7 +119,9 @@ 
class Pferd: def print_report(self) -> None: for name in self._crawlers_to_run: - crawler = self._crawlers[name] + crawler = self._crawlers.get(name) + if crawler is None: + continue # Crawler failed to load log.report("") log.report(f"[bold bright_cyan]Report[/] for {escape(name)}") diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 5a20207..0f2a3e6 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -266,6 +266,11 @@ def parse_whitespace(line: Line) -> None: line.advance() +def parse_eol(line: Line) -> None: + if line.get() is not None: + raise RuleParseError(line, "Expected end of line") + + def parse_rule(line: Line) -> Rule: # Parse left side leftindex = line.index @@ -291,6 +296,8 @@ def parse_rule(line: Line) -> Rule: else: rightpath = PurePath(right) + parse_eol(line) + # Dispatch if arrowname == "": return NormalRule(PurePath(left), rightpath) From 40144f8bd80379c39436e8dc24ba7afbf71081ed Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 15:47:09 +0200 Subject: [PATCH 258/524] Fix rule error messages --- PFERD/transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 0f2a3e6..23844f8 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -156,8 +156,8 @@ class Line: return self._line @property - def line_nr(self) -> str: - return self._line + def line_nr(self) -> int: + return self._line_nr @property def index(self) -> int: From 5a331663e46058d3def8c3fc38af7ac075035bcf Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 15:49:06 +0200 Subject: [PATCH 259/524] Rename functions for consistency --- PFERD/config.py | 6 ++---- PFERD/pferd.py | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/PFERD/config.py b/PFERD/config.py index 4bfada7..1462d82 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -170,8 +170,7 @@ class Config: def dump_to_stdout(self) -> None: self._parser.write(sys.stdout) - # TODO Rename 
to "crawl_sections" - def crawler_sections(self) -> List[Tuple[str, SectionProxy]]: + def crawl_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] for name, proxy in self._parser.items(): if name.startswith("crawl:"): @@ -179,8 +178,7 @@ class Config: return result - # TODO Rename to "auth_sections" - def authenticator_sections(self) -> List[Tuple[str, SectionProxy]]: + def auth_sections(self) -> List[Tuple[str, SectionProxy]]: result = [] for name, proxy in self._parser.items(): if name.startswith("auth:"): diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 434407d..df48bd2 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -28,7 +28,7 @@ class Pferd: def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]: log.explain_topic("Deciding which crawlers to run") - crawl_sections = [name for name, _ in config.crawler_sections()] + crawl_sections = [name for name, _ in config.crawl_sections()] if cli_crawlers is None: log.explain("No crawlers specified on CLI") @@ -63,7 +63,7 @@ class Pferd: return crawlers_to_run def _load_authenticators(self) -> None: - for name, section in self._config.authenticator_sections(): + for name, section in self._config.auth_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") auth_type = section.get("type") authenticator_constructor = AUTHENTICATORS.get(auth_type) @@ -77,7 +77,7 @@ class Pferd: # Cookie sharing kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {} - for name, section in self._config.crawler_sections(): + for name, section in self._config.crawl_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") crawl_type = section.get("type") From aabce764ace344a93c0876b6a304921b0ca09db6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 15:54:01 +0200 Subject: [PATCH 260/524] Clean up TODOs --- PFERD/crawl/ilias/kit_ilias_html.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py 
b/PFERD/crawl/ilias/kit_ilias_html.py index 032bb27..4bc3161 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -393,8 +393,7 @@ class IliasPage: ) if modification_date_match is None: modification_date = None - # TODO: Figure out if this is expected or *always* an error. - log.explain(f"Element {name} at {url} has no date. Properties: {all_properties_text!r}") + log.explain(f"Element {name} at {url} has no date.") else: modification_date_str = modification_date_match.group(1) modification_date = demangle_date(modification_date_str) @@ -420,9 +419,6 @@ class IliasPage: if "target=file_" in parsed_url.query: return IliasElementType.FILE - # TODO: Match based on CMD_CLASS or icon? The folder_like check looks at the icon, - # but we could also match the command class. I am not sure what's more stable. - # Everything with a ref_id can *probably* be opened to reveal nested things # video groups, directories, exercises, etc if "ref_id=" in parsed_url.query: From a848194601cb731b80c7cefb690b87864df8a243 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 17:15:13 +0200 Subject: [PATCH 261/524] Rename plaintext link option to "plaintext" --- PFERD/crawl/ilias/file_templates.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 6f2b1cd..151a41b 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -97,14 +97,14 @@ URL={{link}} class Links(Enum): IGNORE = "ignore" - PLAIN = "plain" + PLAINTEXT = "plaintext" FANCY = "fancy" INTERNET_SHORTCUT = "internet-shortcut" def template(self) -> Optional[str]: if self == self.FANCY: return _link_template_fancy - elif self == self.PLAIN: + elif self == self.PLAINTEXT: return _link_template_plain elif self == self.INTERNET_SHORTCUT: return _link_template_internet_shortcut @@ -115,7 +115,7 @@ class Links(Enum): def extension(self) -> 
Optional[str]: if self == self.FANCY: return ".html" - elif self == self.PLAIN: + elif self == self.PLAINTEXT: return ".txt" elif self == self.INTERNET_SHORTCUT: return ".url" @@ -128,5 +128,5 @@ class Links(Enum): try: return Links(string) except ValueError: - raise ValueError("must be one of 'ignore', 'plain'," + raise ValueError("must be one of 'ignore', 'plaintext'," " 'html', 'internet-shortcut'") From 519a7ef435b8771214e910c2436830dd98ed8022 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 16:57:14 +0200 Subject: [PATCH 262/524] Split --dump-config into two options --dump-config with its optional argument tended to consume the command name, so it had to be split up. --- PFERD/__main__.py | 16 ++++++++++------ PFERD/cli/parser.py | 9 ++++++--- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index b7c5fa9..b42c526 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -46,7 +46,7 @@ def configure_logging_from_args(args: argparse.Namespace) -> None: # We want to prevent any unnecessary output if we're printing the config to # stdout, otherwise it would not be a valid config file. - if args.dump_config == "-": + if args.dump_config_to == "-": log.output_explain = False log.output_status = False log.output_report = False @@ -56,7 +56,7 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N # In configure_logging_from_args(), all normal logging is already disabled # whenever we dump the config. We don't want to override that decision with # values from the config file. 
- if args.dump_config == "-": + if args.dump_config_to == "-": return try: @@ -74,13 +74,17 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N def dump_config(args: argparse.Namespace, config: Config) -> None: log.explain_topic("Dumping config") + if args.dump_config and args.dump_config_to is not None: + log.error("--dump-config and --dump-config-to can't be specified at the same time") + exit(1) + try: - if args.dump_config is True: + if args.dump_config: config.dump() - elif args.dump_config == "-": + elif args.dump_config_to == "-": config.dump_to_stdout() else: - config.dump(Path(args.dump_config)) + config.dump(Path(args.dump_config_to)) except ConfigDumpError as e: log.error(str(e)) log.error_contd(e.reason) @@ -101,7 +105,7 @@ def main() -> None: # all places that were not already covered by CLI args. configure_logging_from_config(args, config) - if args.dump_config is not None: + if args.dump_config or args.dump_config_to is not None: dump_config(args, config) exit() diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index 4e3b425..e6b0671 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -155,11 +155,14 @@ PARSER.add_argument( ) PARSER.add_argument( "--dump-config", - nargs="?", - const=True, + action="store_true", + help="dump current configuration to the default config path and exit" +) +PARSER.add_argument( + "--dump-config-to", metavar="PATH", help="dump current configuration to a file and exit." 
- " Uses default config file path if no path is specified" + " Use '-' as path to print to stdout instead" ) PARSER.add_argument( "--crawler", "-C", From c665c36d88d0d48791e5818effed2c12a83b81e8 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 17:16:57 +0200 Subject: [PATCH 263/524] Update README, CHANGELOG --- CHANGELOG.md | 19 ++++++++- CONFIG.md | 2 +- README.md | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 134 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 519c046..3f032cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,13 +8,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ### Added -- Support for concurrent downloads -- Support for proper config files +- Proper config files +- Concurrent crawling +- Crawl external ILIAS links +- Crawl uploaded exercise solutions +- Explain what PFERD is doing and why (`--explain`) +- More control over output (`--status`, `--report`) +- Print report after exiting via Ctrl+C +- Store crawler reports in `.report` JSON file +- Extensive config file documentation (`CONFIG.md`) +- Documentation for developers (`DEV.md`) - This changelog ### Changed - Rewrote almost everything +- Better error messages - Redesigned CLI +- Redesigned transform rules +- ILIAS crawling logic (paths may be different) +- Better support for weird paths on Windows +- Set user agent (`PFERD/`) ### Removed - Backwards compatibility with 2.x +- Python files as config files +- Some types of crawlers diff --git a/CONFIG.md b/CONFIG.md index 2f2dbbe..b48a2dd 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -90,7 +90,7 @@ full name of an auth section (including the `auth:` prefix). 
Here is a simple example: -``` +```ini [auth:example] type = simple username = foo diff --git a/README.md b/README.md index f9d718e..8e3b387 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,14 @@ Other resources: - [Changelog](CHANGELOG.md) - [Development Guide](DEV.md) -## Installation with pip +## Installation + +### Direct download + +Binaries for Linux, Windows and Mac can be downloaded directly from the +[latest release](https://github.com/Garmelon/PFERD/releases/latest). + +### With pip Ensure you have at least Python 3.8 installed. Run the following command to install PFERD or upgrade it to the latest version: @@ -18,3 +25,111 @@ $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest ``` The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. + +## Basic usage + +PFERD can be run directly from the command line with no config file. +Run `pferd -h` to get an overview of available commands and options. +Run `pferd -h` to see which options a command has. + +For example, you can download your personal desktop from the KIT ILIAS like +this: + +``` +$ pferd kit-ilias-web desktop +``` + +Also, you can download most ILIAS pages directly like this: + +``` +$ pferd kit-ilias-web +``` + +However, the CLI only lets you download a single thing at a time, and the +resulting command can grow long quite quickly. Because of this, PFERD can also +be used with a config file. + +To get started, just take a command you've been using and add `--dump-config` +directly after `pferd`, like this: + +``` +$ pferd --dump-config kit-ilias-web +``` + +This will make PFERD write its current configuration to its default config file +path. You can then run `pferd` without a command and it will execute the config +file. Alternatively, you can use `--dump-config-to` and specify a path yourself. 
+Using `--dump-config-to -` will print the configuration to stdout instead of a +file, which is a good way to see what is actually going on when using a CLI +command. + +Another good way to see what PFERD is doing is the `--explain` option. When +enabled, PFERD explains in detail what it is doing and why. This can help with +debugging your own config, for example. + +If you don't want to run all crawlers from your config file, you can specify the +crawlers you want to run with `--crawler` or `-C`, like this: + +``` +$ pferd -C crawler1 -C crawler2 +``` + +## Advanced usage + +PFERD supports lots of different options. For example, you can configure PFERD +to [use your system's keyring](CONFIG.md#the-keyring-authenticator) instead of +prompting you for your username and password. PFERD also supports +[transformation rules](CONFIG.md#transformation-rules) that let you rename or +exclude certain files. + +For more details, see the comprehensive [config format documentation](CONFIG.md). + +## Example + +This example downloads a few courses from the KIT ILIAS with a common keyring +authenticator. It reorganizes and ignores some files. + +```ini +[DEFAULT] +# All paths will be relative to this. +# The crawler output directories will be /Foo and /Bar. +working_dir = ~/stud +# If files vanish from ILIAS the local files are not deleted, allowing us to +# take a look at them before deleting them ourselves. +on_conflict = no-delete + +[auth:ilias] +type = keyring +username = foo + +[crawl:Foo] +type = kit-ilias-web +auth = auth:ilias +# Crawl a course by its ID (found as `ref_id=ID` in the URL) +target = 1234567 + +# Plaintext files are easier to read by other tools +links = plaintext + +transform = + # Ignore unneeded folders + Online-Tests --> ! + Vorlesungswerbung --> ! + + # Move exercises to own folder. Rename them to "Blatt-XX.pdf" to make them sort properly + "Übungsunterlagen/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf + # Move solutions to own folder. 
Rename them to "Blatt-XX-Lösung.pdf" to make them sort properly + "Übungsunterlagen/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf + + # The course has nested folders with the same name - flatten them + "Übungsunterlagen/(.+?)/\\1/(.*)" -re-> Übung/{g1}/{g2} + + # Rename remaining folders + Übungsunterlagen --> Übung + Lehrbücher --> Vorlesung + +[crawl:Bar] +type = kit-ilias-web +auth = auth:ilias +target = 1337420 +``` From 6644126b5dd87bc587edd4ec77344800d945566a Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 17:29:39 +0200 Subject: [PATCH 264/524] Fix package discovery --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 1cbfc6a..3b6e43b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ name = PFERD version = attr: PFERD.version.VERSION [options] -packages = PFERD +packages = find: python_requires = >=3.8 install_requires = aiohttp>=3.7.4.post0 From f85b75df8c3a945e1bf4a6a087705e51a54e5455 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 17:33:05 +0200 Subject: [PATCH 265/524] Switch from exit() to sys.exit() Pyinstaller doesn't recognize exit(). 
--- PFERD/__main__.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index b42c526..9399a10 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -1,6 +1,7 @@ import argparse import asyncio import configparser +import sys from pathlib import Path from .cli import PARSER, load_default_section @@ -33,7 +34,7 @@ def load_config(args: argparse.Namespace) -> Config: except ConfigLoadError as e: log.error(str(e)) log.error_contd(e.reason) - exit(1) + sys.exit(1) def configure_logging_from_args(args: argparse.Namespace) -> None: @@ -68,7 +69,7 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N log.output_report = config.default_section.report() except ConfigOptionError as e: log.error(str(e)) - exit(1) + sys.exit(1) def dump_config(args: argparse.Namespace, config: Config) -> None: @@ -76,7 +77,7 @@ def dump_config(args: argparse.Namespace, config: Config) -> None: if args.dump_config and args.dump_config_to is not None: log.error("--dump-config and --dump-config-to can't be specified at the same time") - exit(1) + sys.exit(1) try: if args.dump_config: @@ -88,7 +89,7 @@ def dump_config(args: argparse.Namespace, config: Config) -> None: except ConfigDumpError as e: log.error(str(e)) log.error_contd(e.reason) - exit(1) + sys.exit(1) def main() -> None: @@ -107,25 +108,25 @@ def main() -> None: if args.dump_config or args.dump_config_to is not None: dump_config(args, config) - exit() + sys.exit() try: pferd = Pferd(config, args.crawler) except PferdLoadError as e: log.unlock() log.error(str(e)) - exit(1) + sys.exit(1) try: asyncio.run(pferd.run()) except ConfigOptionError as e: log.unlock() log.error(str(e)) - exit(1) + sys.exit(1) except RuleParseError as e: log.unlock() e.pretty_print() - exit(1) + sys.exit(1) except KeyboardInterrupt: log.unlock() log.explain_topic("Interrupted, exiting immediately") @@ -135,9 +136,9 @@ def main() -> None: # TODO Clean up 
tmp files # And when those files *do* actually get cleaned up properly, # reconsider if this should really exit with 1 - exit(1) + sys.exit(1) except Exception: log.unlock() log.unexpected_exception() pferd.print_report() - exit(1) + sys.exit(1) From 07a75a37c31e80f8e6b90500cde97d356328d553 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 15:57:03 +0000 Subject: [PATCH 266/524] Fix FileNotFoundError on Windows --- PFERD/output_dir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index ea3a3e0..7883ee0 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -148,7 +148,7 @@ class OutputDirectory: # If you want longer paths, you will have to add the "\\?\" prefix # in front of your path. See: # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation - self._root = Path("\\\\?\\" + str(root)) + self._root = Path("\\\\?\\" + str(root.absolute())) else: self._root = root From 263780e6a3429458dff17945fbac91c2a482451e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 18:09:51 +0200 Subject: [PATCH 267/524] Use certifi to ensure CA certificates are bundled in pyinstaller --- PFERD/crawl/http_crawler.py | 3 +++ setup.cfg | 1 + 2 files changed, 4 insertions(+) diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 177972b..9f52c66 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -1,8 +1,10 @@ import asyncio +import ssl from pathlib import Path, PurePath from typing import Dict, List, Optional import aiohttp +import certifi from aiohttp.client import ClientTimeout from ..auth import Authenticator @@ -155,6 +157,7 @@ class HttpCrawler(Crawler): async with aiohttp.ClientSession( headers={"User-Agent": f"{NAME}/{VERSION}"}, cookie_jar=self._cookie_jar, + connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())), timeout=ClientTimeout( # 30 minutes. 
No download in the history of downloads was longer than 30 minutes. # This is enough to transfer a 600 MB file over a 3 Mib/s connection. diff --git a/setup.cfg b/setup.cfg index 3b6e43b..5758282 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,6 +10,7 @@ install_requires = beautifulsoup4>=4.9.3 rich>=10.1.0 keyring>=23.0.1 + certifi>=2020.12.5 [options.entry_points] console_scripts = From 30be4e29fad0a6ef1abeef99db2c3c944ec46cbd Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 25 May 2021 16:34:18 +0000 Subject: [PATCH 268/524] Add workaround for RuntimeError after program finishes on Windows --- PFERD/__main__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 9399a10..69c819b 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -118,7 +118,10 @@ def main() -> None: sys.exit(1) try: - asyncio.run(pferd.run()) + loop = asyncio.get_event_loop() + loop.run_until_complete(pferd.run()) + loop.run_until_complete(asyncio.sleep(1)) + loop.close() except ConfigOptionError as e: log.unlock() log.error(str(e)) From 66f0e398a10b4d0ec1595f20dcc169a295e56059 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 19:19:51 +0200 Subject: [PATCH 269/524] Await result in tfa authenticate path --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 6013d77..fa68ee7 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -539,7 +539,7 @@ class KitShibbolethLogin: "_eventId_proceed": "", "j_tokenNumber": tfa_token } - return _post(session, url, data) + return await _post(session, url, data) @staticmethod def _login_successful(soup: BeautifulSoup) -> bool: From 2d8dcc87ff19e913459845ccb8b173fc6da6abe2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 May 2021 19:23:06 +0200 Subject: [PATCH 
270/524] Send CSRF token in TFA request --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index fa68ee7..60be6d8 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -531,13 +531,15 @@ class KitShibbolethLogin: # credentials rather than after asking. form = soup.find("form", {"method": "post"}) action = form["action"] + csrf_token = form.find("input", {"name": "csrf_token"})["value"] # Equivalent: Enter token in # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO url = "https://idp.scc.kit.edu" + action data = { "_eventId_proceed": "", - "j_tokenNumber": tfa_token + "j_tokenNumber": tfa_token, + "csrf_token": csrf_token } return await _post(session, url, data) From 915e42fd07b28bed722258be1386de669d928f3b Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 May 2021 10:51:41 +0200 Subject: [PATCH 271/524] Fix report not being printed if pferd exits normally --- PFERD/__main__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 69c819b..55be1ea 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -145,3 +145,5 @@ def main() -> None: log.unexpected_exception() pferd.print_report() sys.exit(1) + else: + pferd.print_report() From a879c6ab6eea4adcbe7d41a6fafe3b61c65b1e88 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 May 2021 10:52:04 +0200 Subject: [PATCH 272/524] Fix function being printed --- PFERD/__main__.py | 2 +- PFERD/cli/command_kit_ilias_web.py | 3 +++ PFERD/cli/command_local.py | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 55be1ea..59004ae 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -19,7 +19,7 @@ def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser: log.explain("No CLI command specified, 
loading config from file") Config.load_parser(parser, path=args.config) else: - log.explain(f"CLI command specified, creating config for {args.command!r}") + log.explain("CLI command specified, loading config from its arguments") if args.command: args.command(args, parser) diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index ccb7134..c21b6a4 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -3,6 +3,7 @@ import configparser from pathlib import Path from ..crawl.ilias.file_templates import Links +from ..logging import log from .parser import CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, load_crawler, show_value_error SUBPARSER = SUBPARSERS.add_parser( @@ -66,6 +67,8 @@ def load( args: argparse.Namespace, parser: configparser.ConfigParser, ) -> None: + log.explain("Creating config for command 'kit-ilias-web'") + parser["crawl:ilias"] = {} section = parser["crawl:ilias"] load_crawler(args, section) diff --git a/PFERD/cli/command_local.py b/PFERD/cli/command_local.py index 73f9d43..309c42f 100644 --- a/PFERD/cli/command_local.py +++ b/PFERD/cli/command_local.py @@ -2,6 +2,7 @@ import argparse import configparser from pathlib import Path +from ..logging import log from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler SUBPARSER = SUBPARSERS.add_parser( @@ -49,6 +50,8 @@ def load( args: argparse.Namespace, parser: configparser.ConfigParser, ) -> None: + log.explain("Creating config for command 'local'") + parser["crawl:local"] = {} section = parser["crawl:local"] load_crawler(args, section) From adb5d4ade37deab0c42af67cd7d2fdbdc46bc483 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 May 2021 10:58:19 +0200 Subject: [PATCH 273/524] Print files that are *not* deleted by cleanup These are files that are not present on the remote source any more, but still present locally. They also show up in the report. 
--- PFERD/output_dir.py | 3 +++ PFERD/pferd.py | 3 +++ PFERD/report.py | 11 +++++++++++ 3 files changed, 17 insertions(+) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 7883ee0..304101a 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -493,6 +493,9 @@ class OutputDirectory: self._report.delete_file(pure) except OSError: pass + else: + log.status(f"[bold bright_magenta]Not deleted[/] {escape(fmt_path(pure))}") + self._report.not_delete_file(pure) def load_prev_report(self) -> None: log.explain_topic(f"Loading previous report from {fmt_real_path(self._report_path)}") diff --git a/PFERD/pferd.py b/PFERD/pferd.py index df48bd2..7f4d6ff 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -136,6 +136,9 @@ class Pferd: for path in sorted(crawler.report.deleted_files): something_changed = True log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}") + for path in sorted(crawler.report.not_deleted_files): + something_changed = True + log.report(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") if not something_changed: log.report(" Nothing changed") diff --git a/PFERD/report.py b/PFERD/report.py index 4f15237..b47490f 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -56,6 +56,7 @@ class Report: self.added_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set() + self.not_deleted_files: Set[PurePath] = set() @staticmethod def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]: @@ -93,6 +94,8 @@ class Report: self.change_file(PurePath(elem)) for elem in self._get_list_of_strs(data, "deleted"): self.delete_file(PurePath(elem)) + for elem in self._get_list_of_strs(data, "not_deleted"): + self.not_delete_file(PurePath(elem)) return self @@ -107,6 +110,7 @@ class Report: "added": [str(path) for path in sorted(self.added_files)], "changed": [str(path) for path in sorted(self.changed_files)], "deleted": [str(path) for path in sorted(self.deleted_files)], + 
"not_deleted": [str(path) for path in sorted(self.not_deleted_files)], } with open(path, "w") as f: @@ -163,3 +167,10 @@ class Report: """ self.deleted_files.add(path) + + def not_delete_file(self, path: PurePath) -> None: + """ + Unlike mark(), this function accepts any paths. + """ + + self.not_deleted_files.add(path) From 533f75ea71735dab602340317d09f1c1a3f8d559 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 May 2021 11:37:32 +0200 Subject: [PATCH 274/524] Add --debug-transforms flag --- PFERD/__main__.py | 2 +- PFERD/cli/parser.py | 5 +++++ PFERD/crawl/crawler.py | 20 +++++++++++++++++++- PFERD/pferd.py | 15 ++++++++++++++- 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 59004ae..26a1dc4 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -119,7 +119,7 @@ def main() -> None: try: loop = asyncio.get_event_loop() - loop.run_until_complete(pferd.run()) + loop.run_until_complete(pferd.run(args.debug_transforms)) loop.run_until_complete(asyncio.sleep(1)) loop.close() except ConfigOptionError as e: diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index e6b0671..269a19a 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -164,6 +164,11 @@ PARSER.add_argument( help="dump current configuration to a file and exit." 
" Use '-' as path to print to stdout instead" ) +PARSER.add_argument( + "--debug-transforms", + action="store_true", + help="apply transform rules to files of previous run" +) PARSER.add_argument( "--crawler", "-C", action="append", diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 321daa2..aa0f81c 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -3,7 +3,7 @@ import os from abc import ABC, abstractmethod from datetime import datetime from pathlib import Path, PurePath -from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar +from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar from rich.markup import escape @@ -334,3 +334,21 @@ class Crawler(ABC): """ pass + + def debug_transforms(self) -> None: + self._output_dir.load_prev_report() + + if not self.prev_report: + log.warn("Couldn't find or load old report") + return + + seen: Set[PurePath] = set() + for known in self.prev_report.known_files: + looking_at = list(reversed(known.parents)) + [known] + for path in looking_at: + if path in seen: + continue + + log.explain_topic(f"Transforming {fmt_path(path)}") + self._transformer.transform(path) + seen.add(path) diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 7f4d6ff..ac373cf 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -92,7 +92,14 @@ class Pferd: if isinstance(crawler, KitIliasWebCrawler): crawler.share_cookies(kit_ilias_web_paths) - async def run(self) -> None: + def debug_transforms(self) -> None: + for name in self._crawlers_to_run: + crawler = self._crawlers[name] + log.print("") + log.print(f"[bold bright_cyan]Debugging transforms[/] for {escape(name)}") + crawler.debug_transforms() + + async def run(self, debug_transforms: bool) -> None: """ May throw ConfigOptionError. 
""" @@ -103,6 +110,12 @@ class Pferd: self._load_authenticators() self._load_crawlers() + if debug_transforms: + log.output_explain = True + log.output_report = False + self.debug_transforms() + return + log.print("") for name in self._crawlers_to_run: From 17207546e9cbd76dfb2dc2dd4cb18db72188239b Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 May 2021 11:47:51 +0200 Subject: [PATCH 275/524] Document --debug-transforms --- CHANGELOG.md | 1 + CONFIG.md | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f032cf..8460a94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Crawl uploaded exercise solutions - Explain what PFERD is doing and why (`--explain`) - More control over output (`--status`, `--report`) +- Debug transform rules with `--debug-transforms` - Print report after exiting via Ctrl+C - Store crawler reports in `.report` JSON file - Extensive config file documentation (`CONFIG.md`) diff --git a/CONFIG.md b/CONFIG.md index b48a2dd..06d7dab 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -205,6 +205,9 @@ They are specified line-by-line in a crawler's `transform` option. When a crawler needs to apply a rule to a path, it goes through this list top-to-bottom and choose the first matching rule. +To see this process in action, you can use the `--debug-transforms` or flag or +the `--explain` flag. + Each line has the format `SOURCE ARROW TARGET` where `TARGET` is optional. `SOURCE` is either a normal path without spaces (e. g. `foo/bar`), or a string literal delimited by `"` or `'` (e. g. `"foo\" bar/baz"`). 
Python's string From 2c72a9112cea5f812ee3e039297c01044a9cb534 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 27 May 2021 13:20:37 +0200 Subject: [PATCH 276/524] Reword `-name->` and `-name-re->` docs and remove `-name-exact->` --- CONFIG.md | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 06d7dab..fcc263a 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -213,7 +213,7 @@ Each line has the format `SOURCE ARROW TARGET` where `TARGET` is optional. literal delimited by `"` or `'` (e. g. `"foo\" bar/baz"`). Python's string escape syntax is supported. Trailing slashes are ignored. `TARGET` can be formatted like `SOURCE`, but it can also be a single exclamation mark without -quotes (`!`). `ARROW` is one of `-->`, `-exact->`, `-name->`, `-re->` and +quotes (`!`). `ARROW` is one of `-->`, `-name->`, `-exact->`, `-re->` and `-name-re->` If a rule's target is `!`, this means that when the rule matches on a path, the @@ -230,11 +230,15 @@ well as all its contents. ### The `-name->` arrow -The `-name->` arrow works similar to the `-->` arrow, but pretends it is in the -same directory as the file or directory it is applied to. For example, the rule -`bar -name-> baz` would convert `foo/bar` into `foo/baz` and `foo/bar/xyz` into -`foo/baz/xyz`. The rule `foo --> !` would ignore all files and directories named -`foo` as well as their contents. +The `-name->` arrow lets you rename files and directories by their name, +regardless of where they appear in the file tree. Because of this, its `SOURCE` +must not contain multiple path segments, only a single name. This restriction +does not apply to its `TARGET`. The `-name->` arrow is not applied recursively +to its own output to prevent infinite loops. + +For example, the rule `foo -name-> bar/baz` would convert `a/foo` into +`a/bar/baz` and `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz`. 
The rule `foo +-name-> !` would ignore all directories and files named `foo`. ### The `-exact->` arrow @@ -244,14 +248,6 @@ but `foo/bar/xyz` would be unaffected. Also, `foo -exact-> !` would only ignore `foo`, but not its contents (if it has any). The examples below show why this is useful. -### The `-name-exact->` arrow - -The `-name-exact->` arrow works similar to the `-exact->` arrow, but pretends it -is in the same directory as the file or directory it is applied to. For example, -the rule `bar -name-exact-> baz` would convert `foo/bar` into `foo/baz` but -`foo/bar/xyz` would be unaffected. The rule `foo --> !` would ignore only ignore -files and directories named `foo`, but not their contents. - ### The `-re->` arrow The `-re->` arrow uses regular expressions. `SOURCE` is a regular expression @@ -275,8 +271,14 @@ example `{g2.lower()}` or `{g3.replace(' ', '_')}`. ### The `-name-re->` arrow -The `-name-re>` arrow works similar to the `-re->` arrow, but pretends it is in -the same directory as the file or directory it is applied to. +The `-name-re->` arrow is like a combination of the `-name->` and `-re->` arrows. +Instead of the `SOURCE` being the name of a directory or file, it's a regex that +is matched against the names of directories and files. `TARGET` works like the +`-re->` arrow's target. + +For example, the arrow `(.*)\.jpeg -name-re-> {g1}.jpg` will rename all `.jpeg` +extensions into `.jpg`. The arrow `\..+ -name-re-> !` will ignore all files and +directories starting with `.`. 
### Example: Tutorials From 80acc4b50d8e41f75ef83ad79214cc57cdd00c61 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 27 May 2021 13:42:49 +0200 Subject: [PATCH 277/524] Implement new name arrows --- PFERD/transformer.py | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 23844f8..f147fb2 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -85,12 +85,29 @@ class NameRule(Rule): self._subrule = subrule def transform(self, path: PurePath) -> Union[PurePath, bool]: - name = PurePath(*path.parts[-1:]) - result = self._subrule.transform(name) - if isinstance(result, PurePath): - return path.parent / result - else: + matched = False + result = PurePath() + + for part in path.parts: + part_result = self._subrule.transform(PurePath(part)) + if isinstance(part_result, PurePath): + matched = True + result /= part_result + elif part_result: + # If any subrule call ignores its path segment, the entire path + # should be ignored + return True + else: + # The subrule doesn't modify this segment, but maybe other + # segments + result /= part + + if matched: return result + else: + # The subrule has modified no segments, so this name version of it + # doesn't match + return False class ReRule(Rule): @@ -278,6 +295,7 @@ def parse_rule(line: Line) -> Rule: if isinstance(left, bool): line.index = leftindex raise RuleParseError(line, "Left side can't be '!'") + leftpath = PurePath(left) # Parse arrow parse_whitespace(line) @@ -300,13 +318,14 @@ def parse_rule(line: Line) -> Rule: # Dispatch if arrowname == "": - return NormalRule(PurePath(left), rightpath) + return NormalRule(leftpath, rightpath) elif arrowname == "name": - return NameRule(NormalRule(PurePath(left), rightpath)) + if len(leftpath.parts) > 1: + line.index = leftindex + raise RuleParseError(line, "SOURCE must be a single name, not multiple segments") + return NameRule(ExactRule(leftpath, rightpath)) elif 
arrowname == "exact": - return ExactRule(PurePath(left), rightpath) - elif arrowname == "name-exact": - return NameRule(ExactRule(PurePath(left), rightpath)) + return ExactRule(leftpath, rightpath) elif arrowname == "re": return ReRule(left, right) elif arrowname == "name-re": From 6fa9cfd4c35566f530fc2a03a4713309f3475db1 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 27 May 2021 13:56:01 +0200 Subject: [PATCH 278/524] Fix error when capturing group is None --- CONFIG.md | 4 +++- PFERD/transformer.py | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index fcc263a..f31e7f6 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -258,7 +258,9 @@ are available in `TARGET` for formatting. be referred to as `{g}` (e. g. `{g3}`). `{g0}` refers to the original path. If capturing group *n*'s contents are a valid integer, the integer value is available as `{i}` (e. g. `{i3}`). If capturing group *n*'s contents are a -valid float, the float value is available as `{f}` (e. g. `{f3}`). +valid float, the float value is available as `{f}` (e. g. `{f3}`). If a +capturing group is not present (e. g. when matching the string `cd` with the +regex `(ab)?cd`), the corresponding variables are not defined. Python's format string syntax has rich options for formatting its arguments. For example, to left-pad the capturing group 3 with the digit `0` to width 5, you diff --git a/PFERD/transformer.py b/PFERD/transformer.py index f147fb2..83ffde4 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -7,7 +7,7 @@ import ast import re from abc import ABC, abstractmethod from pathlib import PurePath -from typing import Dict, Optional, Union +from typing import Dict, Optional, Sequence, Union from .logging import log from .utils import fmt_path @@ -122,8 +122,14 @@ class ReRule(Rule): vars: Dict[str, Union[str, int, float]] = {} - groups = [match[0]] + list(match.groups()) + # For some reason, mypy thinks that "groups" has type List[str]. 
+ # But since elements of "match.groups()" can be None, mypy is + # wrong. + groups: Sequence[Optional[str]] = [match[0]] + list(match.groups()) for i, group in enumerate(groups): + if group is None: + continue + vars[f"g{i}"] = group try: @@ -352,7 +358,13 @@ class Transformer: for i, (line, rule) in enumerate(self._rules): log.explain(f"Testing rule {i+1}: {line}") - result = rule.transform(path) + try: + result = rule.transform(path) + except Exception as e: + log.warn(f"Error while testing rule {i+1}: {line}") + log.warn_contd(str(e)) + continue + if isinstance(result, PurePath): log.explain(f"Match found, transformed path to {fmt_path(result)}") return result From 19eed5bdffa9e6f742c85459faf64b80173b4e07 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 27 May 2021 00:31:36 +0200 Subject: [PATCH 279/524] Fix authentication logic conflicts with videos --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 60be6d8..0b20d1c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -360,7 +360,9 @@ class KitIliasWebCrawler(HttpCrawler): page = IliasPage(await self._get_page(element.url), element.url, element) real_element = page.get_child_elements()[0] - await self._stream_from_url(real_element.url, sink, bar) + log.explain(f"Streaming video from real url {real_element.url}") + + await self._stream_from_url(real_element.url, sink, bar, is_video=True) await impl() @@ -374,15 +376,19 @@ class KitIliasWebCrawler(HttpCrawler): async def impl() -> None: assert dl # The function is only reached when dl is not None async with dl as (bar, sink): - await self._stream_from_url(element.url, sink, bar) + await self._stream_from_url(element.url, sink, bar, is_video=False) await impl() - async def _stream_from_url(self, url: str, sink: FileSink, bar: 
ProgressBar) -> None: + async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: async def try_stream() -> bool: - async with self.session.get(url, allow_redirects=False) as resp: - # Redirect means we weren't authenticated - if hdrs.LOCATION in resp.headers: + async with self.session.get(url, allow_redirects=is_video) as resp: + if not is_video: + # Redirect means we weren't authenticated + if hdrs.LOCATION in resp.headers: + return False + # we wanted a video but got HTML + if is_video and "html" in resp.content_type: return False if resp.content_length: From 5beb4d9a2d98949b01afd021f8c4b2157bd67281 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 27 May 2021 00:55:46 +0200 Subject: [PATCH 280/524] Fix renaming conflict with multi-stage video elements --- PFERD/crawl/ilias/kit_ilias_html.py | 20 +++++++++++++++----- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 10 +++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 4bc3161..afb7005 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -62,15 +62,17 @@ class IliasPage: log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() + def get_next_stage_url(self) -> Optional[str]: + if self._is_ilias_opencast_embedding(): + return self.get_child_elements()[0].url + return None + def _is_video_player(self) -> bool: return "paella_config_file" in str(self._soup) def _is_video_listing(self) -> bool: - # ILIAS fluff around it - if self._soup.find(id="headerimage"): - element: Tag = self._soup.find(id="headerimage") - if "opencast" in element.attrs["src"].lower(): - return True + if self._is_ilias_opencast_embedding(): + return True # Raw listing without ILIAS fluff video_element_table: Tag = self._soup.find( @@ -78,6 +80,14 @@ class IliasPage: ) return video_element_table is not None + def 
_is_ilias_opencast_embedding(self) -> bool: + # ILIAS fluff around the real opencast html + if self._soup.find(id="headerimage"): + element: Tag = self._soup.find(id="headerimage") + if "opencast" in element.attrs["src"].lower(): + return True + return False + def _is_exercise_file(self) -> bool: # we know it from before if self._page_type == IliasElementType.EXERCISE: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 0b20d1c..12a6e79 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -242,10 +242,14 @@ class KitIliasWebCrawler(HttpCrawler): async def gather_elements() -> None: elements.clear() async with cl: - soup = await self._get_page(url) + next_stage_url: Optional[str] = url log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") - log.explain(f"URL: {url}") - page = IliasPage(soup, url, parent) + + while next_stage_url: + soup = await self._get_page(next_stage_url) + log.explain(f"URL: {url}") + page = IliasPage(soup, url, parent) + next_stage_url = page.get_next_stage_url() elements.extend(page.get_child_elements()) From 474aa7e1cc154b77899cef1c51023d29459f7ee5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 27 May 2021 15:41:00 +0000 Subject: [PATCH 281/524] Use sorted path order when debugging transforms --- PFERD/crawl/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index aa0f81c..87d362f 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -343,7 +343,7 @@ class Crawler(ABC): return seen: Set[PurePath] = set() - for known in self.prev_report.known_files: + for known in sorted(self.prev_report.known_files): looking_at = list(reversed(known.parents)) + [known] for path in looking_at: if path in seen: From 1ca6740e052166397b76b9eb9df3e7c33cf52efc Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 27 May 2021 17:59:22 +0200 Subject: [PATCH 
282/524] Improve log messages when parsing ILIAS HTML Previously some logs were split around an "await", which isn't a great idea. --- PFERD/crawl/ilias/kit_ilias_html.py | 6 +++--- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index afb7005..a2f30e1 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -142,7 +142,7 @@ class IliasPage: url: str = self._abs_url_from_link(content_link) query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) - log.explain("Found ILIAS redirection page, following it as a new entry") + log.explain("Found ILIAS video frame page, fetching actual content next") return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None @@ -205,7 +205,7 @@ class IliasPage: video_url = self._abs_url_from_link(link) - log.explain(f"Found video {video_name!r} at {video_url!r}") + log.explain(f"Found video {video_name!r} at {video_url}") return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) def _find_exercise_entries(self) -> List[IliasPageElement]: @@ -436,7 +436,7 @@ class IliasPage: _unexpected_html_warning() log.warn_contd( - f"Tried to figure out element type, but failed for {str(element_name)!r} / {link_element!r})" + f"Tried to figure out element type, but failed for {element_name!r} / {link_element!r})" ) return None diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 12a6e79..fbbfc1b 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -243,11 +243,11 @@ class KitIliasWebCrawler(HttpCrawler): elements.clear() async with cl: next_stage_url: Optional[str] = 
url - log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") while next_stage_url: soup = await self._get_page(next_stage_url) - log.explain(f"URL: {url}") + log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") + log.explain(f"URL: {next_stage_url}") page = IliasPage(soup, url, parent) next_stage_url = page.get_next_stage_url() From d65efed561f5131b95bed7ac61c5f0036c1609f3 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 28 May 2021 21:21:04 +0000 Subject: [PATCH 283/524] Slightly adjust phrasing --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e3b387..d25e86f 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ command. Another good way to see what PFERD is doing is the `--explain` option. When enabled, PFERD explains in detail what it is doing and why. This can help with -debugging your own config, for example. +debugging your own config. If you don't want to run all crawlers from your config file, you can specify the crawlers you want to run with `--crawler` or `-C`, like this: From b78eb64f3d00f17a60ff68b6e96ec7356ad9eddc Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 29 May 2021 21:38:36 +0200 Subject: [PATCH 284/524] Document versioning scheme --- CHANGELOG.md | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8460a94..de912c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,24 @@ # Changelog -All notable changes to this project will be documented in this file. +All notable changes to this project will be documented in this file. The format +is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +This project has its own custom versioning scheme. Version numbers consist of +three parts (e. g. `3.1.5`). 
+- The first number is increased on major rewrites or changes. What classifies as + a major change is up to the maintainers. This is pretty rare and a PFERD + version 4 should hopefully not be necessary. +- The second number is increased on backwards-incompatible changes in behaviour. + This refers to any change that would make an existing setup behave differently + (e. g. renaming options or changing crawler behaviour). If this number is + increased, it may be necessary for you to adapt your own setup. +- The third number is increased on backwards-compatible changes (e. g. adding + new options or commands, changing documentation, fixing bugs). Updates that + only increase this number should be safe and not require manual intervention. + +We will try to correctly classify changes as backwards-compatible or +backwards-incompatible, but may occasionally make mistakes or stumble across +ambiguous situations. ## Unreleased From 84f775013fcbf79e6a31d29f1fb8cbd5351dedfa Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 11:41:20 +0200 Subject: [PATCH 285/524] Use event loop workaround only on windows This avoids an unnecessary one-second sleep on other platforms. However, a better "fix" for this sleep would be a less ugly workaround on windows. --- PFERD/__main__.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 26a1dc4..9d61264 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -1,6 +1,7 @@ import argparse import asyncio import configparser +import os import sys from pathlib import Path @@ -118,10 +119,18 @@ def main() -> None: sys.exit(1) try: - loop = asyncio.get_event_loop() - loop.run_until_complete(pferd.run(args.debug_transforms)) - loop.run_until_complete(asyncio.sleep(1)) - loop.close() + if os.name == "nt": + # A "workaround" for the windows event loop somehow crashing after + # asyncio.run() completes. 
See: + # https://bugs.python.org/issue39232 + # https://github.com/encode/httpx/issues/914#issuecomment-780023632 + # TODO Fix this properly + loop = asyncio.get_event_loop() + loop.run_until_complete(pferd.run(args.debug_transforms)) + loop.run_until_complete(asyncio.sleep(1)) + loop.close() + else: + asyncio.run(pferd.run(args.debug_transforms)) except ConfigOptionError as e: log.unlock() log.error(str(e)) From 1dd24551a589ef4d2f2ae25c280ad57ab63aceae Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 11:44:17 +0200 Subject: [PATCH 286/524] Add link to repo in --version output --- PFERD/cli/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index 269a19a..754b8ad 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -145,7 +145,7 @@ PARSER.set_defaults(command=None) PARSER.add_argument( "--version", action="version", - version=f"{NAME} {VERSION}", + version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)", ) PARSER.add_argument( "--config", "-c", From 17879a7f69c2746b080e696949201e4092c46f4f Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 11:50:20 +0200 Subject: [PATCH 287/524] Print box around message for unexpected exceptions --- PFERD/logging.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/PFERD/logging.py b/PFERD/logging.py index 5025d88..1a07b3e 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -8,6 +8,7 @@ from typing import AsyncIterator, ContextManager, Iterator, List, Optional from rich.console import Console, RenderGroup from rich.live import Live from rich.markup import escape +from rich.panel import Panel from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID, TextColumn, TimeRemainingColumn, TransferSpeedColumn) from rich.table import Column @@ -170,10 +171,13 @@ class Log: self.error_contd("") self.error_contd(traceback.format_exc()) - self.error_contd(""" + # Our print function doesn't take 
types other than strings, but the + # underlying rich.print function does. This call is a special case + # anyways, and we're calling it internally, so this should be fine. + self.print(Panel.fit(""" Please copy your program output and send it to the PFERD maintainers, either directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new - """.strip()) + """.strip())) # type: ignore def explain_topic(self, text: str) -> None: """ From 64a29607519fdaca1a17ee1bca3676fc83f33920 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 12:21:05 +0200 Subject: [PATCH 288/524] Align paths in status messages and progress bars Also print "Ignored" when paths are ignored due to transforms --- PFERD/crawl/crawler.py | 18 +++++++----------- PFERD/logging.py | 32 +++++++++++++++++++++++++++----- PFERD/output_dir.py | 12 +++++------- 3 files changed, 39 insertions(+), 23 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 87d362f..ce69967 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -5,8 +5,6 @@ from datetime import datetime from pathlib import Path, PurePath from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar -from rich.markup import escape - from ..auth import Authenticator from ..config import Config, Section from ..deduplicator import Deduplicator @@ -104,12 +102,9 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]): return self._path async def _on_aenter(self) -> ProgressBar: - bar_desc = f"[bold bright_cyan]Crawling[/] {escape(fmt_path(self._path))}" - after_desc = f"[bold cyan]Crawled[/] {escape(fmt_path(self._path))}" - - self._stack.callback(lambda: log.status(after_desc)) + self._stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path))) await self._stack.enter_async_context(self._limiter.limit_crawl()) - bar = self._stack.enter_context(log.crawl_bar(bar_desc)) + bar = self._stack.enter_context(log.crawl_bar("[bold 
bright_cyan]", "Crawling", fmt_path(self._path))) return bar @@ -127,12 +122,11 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]): return self._path async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]: - bar_desc = f"[bold bright_cyan]Downloading[/] {escape(fmt_path(self._path))}" - # The "Downloaded ..." message is printed in the output dir, not here - await self._stack.enter_async_context(self._limiter.limit_download()) sink = await self._stack.enter_async_context(self._fs_token) - bar = self._stack.enter_context(log.download_bar(bar_desc)) + # The "Downloaded ..." message is printed in the output dir, not here + bar = self._stack.enter_context(log.download_bar("[bold bright_cyan]", "Downloading", + fmt_path(self._path))) return bar, sink @@ -273,6 +267,7 @@ class Crawler(ABC): if self._transformer.transform(path) is None: log.explain("Answer: No") + log.status("[bold bright_black]", "Ignored", fmt_path(path)) return None log.explain("Answer: Yes") @@ -291,6 +286,7 @@ class Crawler(ABC): transformed_path = self._transformer.transform(path) if transformed_path is None: log.explain("Answer: No") + log.status("[bold bright_black]", "Ignored", fmt_path(path)) return None fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict) diff --git a/PFERD/logging.py b/PFERD/logging.py index 1a07b3e..32e5268 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -28,6 +28,8 @@ class ProgressBar: class Log: + STATUS_WIDTH = 11 + def __init__(self) -> None: self.console = Console(highlight=False) @@ -195,13 +197,15 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_explain: self.print(f" {escape(text)}") - def status(self, text: str) -> None: + def status(self, style: str, action: str, text: str) -> None: """ - Print a status update while crawling. Allows markup. + Print a status update while crawling. 
Allows markup in the "style" + argument which will be applied to the "action" string. """ if self.output_status: - self.print(text) + action = escape(f"{action:<{self.STATUS_WIDTH}}") + self.print(f"{style}{action}[/] {escape(text)}") def report(self, text: str) -> None: """ @@ -233,16 +237,34 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new def crawl_bar( self, - description: str, + style: str, + action: str, + text: str, total: Optional[float] = None, ) -> ContextManager[ProgressBar]: + """ + Allows markup in the "style" argument which will be applied to the + "action" string. + """ + + action = escape(f"{action:<{self.STATUS_WIDTH}}") + description = f"{style}{action}[/] {text}" return self._bar(self._crawl_progress, description, total) def download_bar( self, - description: str, + style: str, + action: str, + text: str, total: Optional[float] = None, ) -> ContextManager[ProgressBar]: + """ + Allows markup in the "style" argument which will be applied to the + "action" string. 
+ """ + + action = escape(f"{action:<{self.STATUS_WIDTH}}") + description = f"{style}{action}[/] {text}" return self._bar(self._download_progress, description, total) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 304101a..0fb9911 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -11,8 +11,6 @@ from enum import Enum from pathlib import Path, PurePath from typing import BinaryIO, Iterator, Optional, Tuple -from rich.markup import escape - from .logging import log from .report import Report, ReportLoadError from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_yes_no @@ -425,7 +423,7 @@ class OutputDirectory: async def _after_download(self, info: DownloadInfo) -> None: with self._ensure_deleted(info.tmp_path): - log.status(f"[bold cyan]Downloaded[/] {fmt_path(info.remote_path)}") + log.status("[bold cyan]", "Downloaded", fmt_path(info.remote_path)) log.explain_topic(f"Processing downloaded file for {fmt_path(info.path)}") changed = False @@ -456,10 +454,10 @@ class OutputDirectory: self._update_metadata(info) if changed: - log.status(f"[bold bright_yellow]Changed[/] {escape(fmt_path(info.path))}") + log.status("[bold bright_yellow]", "Changed", fmt_path(info.path)) self._report.change_file(info.path) else: - log.status(f"[bold bright_green]Added[/] {escape(fmt_path(info.path))}") + log.status("[bold bright_green]", "Added", fmt_path(info.path)) self._report.add_file(info.path) async def cleanup(self) -> None: @@ -489,12 +487,12 @@ class OutputDirectory: if await self._conflict_delete_lf(self._on_conflict, pure): try: path.unlink() - log.status(f"[bold bright_magenta]Deleted[/] {escape(fmt_path(pure))}") + log.status("[bold bright_magenta]", "Deleted", fmt_path(pure)) self._report.delete_file(pure) except OSError: pass else: - log.status(f"[bold bright_magenta]Not deleted[/] {escape(fmt_path(pure))}") + log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure)) self._report.not_delete_file(pure) def 
load_prev_report(self) -> None: From 7b062883f619238b9992834c39484e9973a172f9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 12:28:11 +0200 Subject: [PATCH 289/524] Use raw paths for --debug-transforms Previously, the already-transformed paths were used, which meant that --debug-transforms was cumbersome to use (as you had to remove all transforms and crawl once before getting useful results). --- PFERD/crawl/crawler.py | 4 +++- PFERD/report.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index ce69967..e990f16 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -264,6 +264,7 @@ class Crawler(ABC): async def crawl(self, path: PurePath) -> Optional[CrawlToken]: log.explain_topic(f"Decision: Crawl {fmt_path(path)}") path = self._deduplicator.mark(path) + self._output_dir.report.found(path) if self._transformer.transform(path) is None: log.explain("Answer: No") @@ -282,6 +283,7 @@ class Crawler(ABC): ) -> Optional[DownloadToken]: log.explain_topic(f"Decision: Download {fmt_path(path)}") path = self._deduplicator.mark(path) + self._output_dir.report.found(path) transformed_path = self._transformer.transform(path) if transformed_path is None: @@ -339,7 +341,7 @@ class Crawler(ABC): return seen: Set[PurePath] = set() - for known in sorted(self.prev_report.known_files): + for known in sorted(self.prev_report.found_paths): looking_at = list(reversed(known.parents)) + [known] for path in looking_at: if path in seen: diff --git a/PFERD/report.py b/PFERD/report.py index b47490f..919bb35 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -50,12 +50,22 @@ class Report: """ def __init__(self) -> None: + # Paths found by the crawler, untransformed + self.found_paths: Set[PurePath] = set() + + # Files reserved for metadata files (e. g. the report file or cookies) + # that can't be overwritten by user transforms and won't be cleaned up + # at the end. 
self.reserved_files: Set[PurePath] = set() + + # Files found by the crawler, transformed. Only includes files that + # were downloaded (or a download was attempted) self.known_files: Set[PurePath] = set() self.added_files: Set[PurePath] = set() self.changed_files: Set[PurePath] = set() self.deleted_files: Set[PurePath] = set() + # Files that should have been deleted by the cleanup but weren't self.not_deleted_files: Set[PurePath] = set() @staticmethod @@ -84,6 +94,8 @@ class Report: raise ReportLoadError("Incorrect format: Root is not an object") self = cls() + for elem in self._get_list_of_strs(data, "found"): + self.found(PurePath(elem)) for elem in self._get_list_of_strs(data, "reserved"): self.mark_reserved(PurePath(elem)) for elem in self._get_list_of_strs(data, "known"): @@ -105,6 +117,7 @@ class Report: """ data = { + "found": [str(path) for path in sorted(self.found_paths)], "reserved": [str(path) for path in sorted(self.reserved_files)], "known": [str(path) for path in sorted(self.known_files)], "added": [str(path) for path in sorted(self.added_files)], @@ -117,6 +130,9 @@ class Report: json.dump(data, f, indent=2, sort_keys=True) f.write("\n") # json.dump doesn't do this + def found(self, path: PurePath) -> None: + self.found_paths.add(path) + def mark_reserved(self, path: PurePath) -> None: if path in self.marked: raise RuntimeError("Trying to reserve an already reserved file") From 921cec7ddcd183414fa4f4d12cb5ae2dcd14150e Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 12:49:04 +0200 Subject: [PATCH 290/524] Bump version to 3.0.0 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index de912c9..5f9ca72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.0.0 - 2021-05-31 + ### Added - Proper config files - Concurrent crawling From 1fba96abcb09c35aa47d3a94c0c758b2457efaa7 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 31 May 2021 18:00:42 +0200 Subject: [PATCH 291/524] Fix exercise date parsing for non-group submissions ILIAS apparently changes the order of the fields as it sees fit, so we now try to parse *every* column, starting from the right, as a date. The first column that parses successfully is then used. --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 17 +++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f9ca72..34c997a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Fixed +- Date parsing now also works correctly in non-group exercises + ## 3.0.0 - 2021-05-31 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index a2f30e1..64491f9 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -230,12 +230,16 @@ class IliasPage: parent_row: Tag = link.findParent("tr") children: List[Tag] = parent_row.findChildren("td") - # - # 0 1 2 3 4 name = _sanitize_path_name(children[1].getText().strip()) - date = demangle_date(children[3].getText().strip()) - log.explain(f"Found exercise detail entry {name!r}") + + for child in reversed(children): + date = demangle_date(child.getText().strip(), fail_silently=True) + if date is not None: + break + if date is None: + log.warn(f"Date parsing failed for exercise entry {name!r}") + results.append(IliasPageElement( IliasElementType.FILE, self._abs_url_from_link(link), @@ -522,7 +526,7 @@ german_months = ['Jan', 'Feb', 'Mär', 'Apr', 'Mai', 'Jun', 'Jul', 'Aug', 'Sep', english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] -def demangle_date(date_str: str) -> Optional[datetime]: +def 
demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]: """ Demangle a given date in one of the following formats: "Gestern, HH:MM" @@ -554,7 +558,8 @@ def demangle_date(date_str: str) -> Optional[datetime]: return datetime(year, month, day, hour, minute) except Exception: - log.warn(f"Date parsing failed for {date_str!r}") + if not fail_silently: + log.warn(f"Date parsing failed for {date_str!r}") return None From 9d5ec84b91fcc3ce546710eaf2f40c37679ea8a1 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 17:55:56 +0200 Subject: [PATCH 292/524] Add credential file authenticator --- CHANGELOG.md | 3 +++ CONFIG.md | 13 +++++++++++ PFERD/auth/__init__.py | 5 +++- PFERD/auth/credential_file.py | 43 +++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 PFERD/auth/credential_file.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 34c997a..1dffa1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- `credential-file` authenticator + ### Fixed - Date parsing now also works correctly in non-group exercises diff --git a/CONFIG.md b/CONFIG.md index f31e7f6..7826b04 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -180,6 +180,19 @@ via the terminal. - `username`: The username. (Optional) - `password`: The password. (Optional) +### The `credential-file` authenticator + +This authenticator reads a username and a password from a credential file. The +credential file has exactly two lines (trailing newline optional). The first +line starts with `username=` and contains the username, the second line starts +with `password=` and contains the password. The username and password may +contain any characters except a line break. + +``` +username=AzureDiamond +password=hunter2 +``` + ### The `keyring` authenticator This authenticator uses the system keyring to store passwords. 
The username can diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 6e7fd3a..39f7f5c 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -3,6 +3,7 @@ from typing import Callable, Dict from ..config import Config from .authenticator import Authenticator, AuthError, AuthSection # noqa: F401 +from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection from .keyring import KeyringAuthenticator, KeyringAuthSection from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator @@ -14,10 +15,12 @@ AuthConstructor = Callable[[ ], Authenticator] AUTHENTICATORS: Dict[str, AuthConstructor] = { + "credential-file": lambda n, s, c: + CredentialFileAuthenticator(n, CredentialFileAuthSection(s)), "simple": lambda n, s, c: SimpleAuthenticator(n, SimpleAuthSection(s)), "tfa": lambda n, s, c: TfaAuthenticator(n), "keyring": lambda n, s, c: - KeyringAuthenticator(n, KeyringAuthSection(s)) + KeyringAuthenticator(n, KeyringAuthSection(s)), } diff --git a/PFERD/auth/credential_file.py b/PFERD/auth/credential_file.py new file mode 100644 index 0000000..540b65b --- /dev/null +++ b/PFERD/auth/credential_file.py @@ -0,0 +1,43 @@ +from pathlib import Path +from typing import Tuple + +from ..utils import fmt_real_path +from .authenticator import Authenticator, AuthLoadError, AuthSection + + +class CredentialFileAuthSection(AuthSection): + def path(self) -> Path: + value = self.s.get("path") + if value is None: + self.missing_value("path") + return Path(value) + + +class CredentialFileAuthenticator(Authenticator): + def __init__(self, name: str, section: CredentialFileAuthSection) -> None: + super().__init__(name) + + path = section.path() + try: + with open(path) as f: + lines = list(f) + except OSError as e: + raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e + + if len(lines) != 2: + raise AuthLoadError("Credential file must be two lines long") + [uline, pline] = lines + uline = 
uline[:-1] # Remove trailing newline + if pline.endswith("\n"): + pline = pline[:-1] + + if not uline.startswith("username="): + raise AuthLoadError("First line must start with 'username='") + if not pline.startswith("password="): + raise AuthLoadError("Second line must start with 'password='") + + self._username = uline[:9] + self._password = pline[:9] + + async def credentials(self) -> Tuple[str, str]: + return self._username, self._password From 1ce32d2f18881d3484889c6a20758dbf0a8d59d9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 18:19:05 +0200 Subject: [PATCH 293/524] Add CLI option for credential file auth to kit-ilias-web --- CHANGELOG.md | 1 + PFERD/__main__.py | 5 ++++- PFERD/cli/__init__.py | 5 +++-- PFERD/cli/command_kit_ilias_web.py | 23 +++++++++++++++++++---- PFERD/cli/parser.py | 4 ++++ 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dffa1e..451853b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. 
### Added - `credential-file` authenticator +- `--credential-file` option for `kit-ilias-web` command ### Fixed - Date parsing now also works correctly in non-group exercises diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 9d61264..1cca8b1 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -5,7 +5,7 @@ import os import sys from pathlib import Path -from .cli import PARSER, load_default_section +from .cli import PARSER, ParserLoadError, load_default_section from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError from .logging import log from .pferd import Pferd, PferdLoadError @@ -36,6 +36,9 @@ def load_config(args: argparse.Namespace) -> Config: log.error(str(e)) log.error_contd(e.reason) sys.exit(1) + except ParserLoadError as e: + log.error(str(e)) + sys.exit(1) def configure_logging_from_args(args: argparse.Namespace) -> None: diff --git a/PFERD/cli/__init__.py b/PFERD/cli/__init__.py index f9cb5d2..d70ecd9 100644 --- a/PFERD/cli/__init__.py +++ b/PFERD/cli/__init__.py @@ -1,11 +1,12 @@ # isort: skip_file # The order of imports matters because each command module registers itself -# with the parser from ".parser". Because of this, isort is disabled for this +# with the parser from ".parser" and the import order affects the order in +# which they appear in the help. Because of this, isort is disabled for this # file. Also, since we're reexporting or just using the side effect of # importing itself, we get a few linting warnings, which we're disabling as # well. from . import command_local # noqa: F401 imported but unused from . 
import command_kit_ilias_web # noqa: F401 imported but unused -from .parser import PARSER, load_default_section # noqa: F401 imported but unused +from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index c21b6a4..12803a6 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -4,7 +4,8 @@ from pathlib import Path from ..crawl.ilias.file_templates import Links from ..logging import log -from .parser import CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, load_crawler, show_value_error +from .parser import (CRAWLER_PARSER, SUBPARSERS, BooleanOptionalAction, ParserLoadError, load_crawler, + show_value_error) SUBPARSER = SUBPARSERS.add_parser( "kit-ilias-web", @@ -38,6 +39,12 @@ GROUP.add_argument( action=BooleanOptionalAction, help="use the system keyring to store and retrieve passwords" ) +GROUP.add_argument( + "--credential-file", + type=Path, + metavar="PATH", + help="read username and password from a credential file" +) GROUP.add_argument( "--links", type=show_value_error(Links.from_string), @@ -88,11 +95,19 @@ def load( parser["auth:ilias"] = {} auth_section = parser["auth:ilias"] - auth_section["type"] = "simple" + if args.credential_file is not None: + if args.username is not None: + raise ParserLoadError("--credential-file and --username can't be used together") + if args.keyring: + raise ParserLoadError("--credential-file and --keyring can't be used together") + auth_section["type"] = "credential-file" + auth_section["path"] = str(args.credential_file) + elif args.keyring: + auth_section["type"] = "keyring" + else: + auth_section["type"] = "simple" if args.username is not None: auth_section["username"] = args.username - if args.keyring: - auth_section["type"] = "keyring" SUBPARSER.set_defaults(command=load) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index 754b8ad..f5fb215 100644 
--- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -8,6 +8,10 @@ from ..output_dir import OnConflict, Redownload from ..version import NAME, VERSION +class ParserLoadError(Exception): + pass + + # TODO Replace with argparse version when updating to 3.9? class BooleanOptionalAction(argparse.Action): def __init__( From 49ad1b6e463ecd135b13ae6a548eb87e95ea6c55 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 18:21:18 +0200 Subject: [PATCH 294/524] Clean up authenticator code formatting --- PFERD/auth/__init__.py | 4 ++-- PFERD/auth/authenticator.py | 5 +---- PFERD/auth/keyring.py | 6 +----- PFERD/auth/simple.py | 6 +----- PFERD/auth/tfa.py | 5 +---- 5 files changed, 6 insertions(+), 20 deletions(-) diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 39f7f5c..eff8370 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -17,10 +17,10 @@ AuthConstructor = Callable[[ AUTHENTICATORS: Dict[str, AuthConstructor] = { "credential-file": lambda n, s, c: CredentialFileAuthenticator(n, CredentialFileAuthSection(s)), + "keyring": lambda n, s, c: + KeyringAuthenticator(n, KeyringAuthSection(s)), "simple": lambda n, s, c: SimpleAuthenticator(n, SimpleAuthSection(s)), "tfa": lambda n, s, c: TfaAuthenticator(n), - "keyring": lambda n, s, c: - KeyringAuthenticator(n, KeyringAuthSection(s)), } diff --git a/PFERD/auth/authenticator.py b/PFERD/auth/authenticator.py index fe14909..f588bc4 100644 --- a/PFERD/auth/authenticator.py +++ b/PFERD/auth/authenticator.py @@ -17,10 +17,7 @@ class AuthSection(Section): class Authenticator(ABC): - def __init__( - self, - name: str - ) -> None: + def __init__(self, name: str) -> None: """ Initialize an authenticator from its name and its section in the config file. 
diff --git a/PFERD/auth/keyring.py b/PFERD/auth/keyring.py index c7ca2c2..c14f6fb 100644 --- a/PFERD/auth/keyring.py +++ b/PFERD/auth/keyring.py @@ -18,11 +18,7 @@ class KeyringAuthSection(AuthSection): class KeyringAuthenticator(Authenticator): - def __init__( - self, - name: str, - section: KeyringAuthSection, - ) -> None: + def __init__(self, name: str, section: KeyringAuthSection) -> None: super().__init__(name) self._username = section.username() diff --git a/PFERD/auth/simple.py b/PFERD/auth/simple.py index d2f4123..831c12f 100644 --- a/PFERD/auth/simple.py +++ b/PFERD/auth/simple.py @@ -14,11 +14,7 @@ class SimpleAuthSection(AuthSection): class SimpleAuthenticator(Authenticator): - def __init__( - self, - name: str, - section: SimpleAuthSection, - ) -> None: + def __init__(self, name: str, section: SimpleAuthSection) -> None: super().__init__(name) self._username = section.username() diff --git a/PFERD/auth/tfa.py b/PFERD/auth/tfa.py index 28ba150..26b1383 100644 --- a/PFERD/auth/tfa.py +++ b/PFERD/auth/tfa.py @@ -6,10 +6,7 @@ from .authenticator import Authenticator, AuthError class TfaAuthenticator(Authenticator): - def __init__( - self, - name: str, - ) -> None: + def __init__(self, name: str) -> None: super().__init__(name) async def username(self) -> str: From f40820c41fd110465ad5cb2172e3f547a2b6fea6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 21:07:13 +0200 Subject: [PATCH 295/524] Warn if using concurrent tasks with kit-ilias-web --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 451853b..51e9a5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. 
### Added - `credential-file` authenticator - `--credential-file` option for `kit-ilias-web` command +- Warning if using concurrent tasks with `kit-ilias-web` ### Fixed - Date parsing now also works correctly in non-group exercises diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index fbbfc1b..78428e0 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -21,7 +21,6 @@ TargetType = Union[str, int] class KitIliasWebCrawlerSection(HttpCrawlerSection): - def target(self) -> TargetType: target = self.s.get("target") if not target: @@ -164,6 +163,12 @@ class KitIliasWebCrawler(HttpCrawler): auth = section.auth(authenticators) super().__init__(name, section, config, shared_auth=auth) + if section.tasks() > 1: + log.warn(""" +Please avoid using too many parallel requests as these are the KIT ILIAS +instance's greatest bottleneck. + """.strip()) + self._shibboleth_login = KitShibbolethLogin( auth, section.tfa_auth(authenticators), From 722970a2556e0c24584bc46fd088b24eea8fc406 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 May 2021 20:04:56 +0000 Subject: [PATCH 296/524] Store cookies in text-based format Using the stdlib's http.cookie module, cookies are now stored as one "Set-Cookie" header per line. Previously, the aiohttp.CookieJar's save() and load() methods were used (which use pickling). --- CHANGELOG.md | 3 +++ PFERD/crawl/http_crawler.py | 26 +++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51e9a5b..f7e33ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ ambiguous situations. 
- `--credential-file` option for `kit-ilias-web` command - Warning if using concurrent tasks with `kit-ilias-web` +### Changed +- Cookies are now stored in a text-based format + ### Fixed - Date parsing now also works correctly in non-group exercises diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 9f52c66..fa4cf29 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -1,7 +1,8 @@ import asyncio +import http.cookies import ssl from pathlib import Path, PurePath -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import aiohttp import certifi @@ -105,6 +106,25 @@ class HttpCrawler(Crawler): self._shared_cookie_jar_paths.append(self._cookie_jar_path) + def _load_cookies_from_file(self, path: Path) -> None: + jar: Any = http.cookies.SimpleCookie() + with open(path) as f: + for i, line in enumerate(f): + # Names of headers are case insensitive + if line[:11].lower() == "set-cookie:": + jar.load(line[11:]) + else: + log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it") + self._cookie_jar.update_cookies(jar) + + def _save_cookies_to_file(self, path: Path) -> None: + jar: Any = http.cookies.SimpleCookie() + for morsel in self._cookie_jar: + jar[morsel.key] = morsel + with open(path, "w") as f: + f.write(jar.output(sep="\n")) + f.write("\n") # A trailing newline is just common courtesy + def _load_cookies(self) -> None: log.explain_topic("Loading cookies") @@ -134,7 +154,7 @@ class HttpCrawler(Crawler): log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}") try: - self._cookie_jar.load(cookie_jar_path) + self._load_cookies_from_file(cookie_jar_path) except Exception as e: log.explain("Failed to load cookies") log.explain(str(e)) @@ -144,7 +164,7 @@ class HttpCrawler(Crawler): try: log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}") - self._cookie_jar.save(self._cookie_jar_path) + self._save_cookies_to_file(self._cookie_jar_path) except 
Exception as e: log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") log.warn(str(e)) From f6b26f4eade09f65986e87019b392f3f2c019b88 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 1 Jun 2021 09:10:58 +0000 Subject: [PATCH 297/524] Fix unexpected exception when credential file not found --- PFERD/__main__.py | 3 ++- PFERD/auth/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 1cca8b1..5ae62bb 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -5,6 +5,7 @@ import os import sys from pathlib import Path +from .auth import AuthLoadError from .cli import PARSER, ParserLoadError, load_default_section from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError from .logging import log @@ -134,7 +135,7 @@ def main() -> None: loop.close() else: asyncio.run(pferd.run(args.debug_transforms)) - except ConfigOptionError as e: + except (ConfigOptionError, AuthLoadError) as e: log.unlock() log.error(str(e)) sys.exit(1) diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index eff8370..06b3ba4 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -2,7 +2,7 @@ from configparser import SectionProxy from typing import Callable, Dict from ..config import Config -from .authenticator import Authenticator, AuthError, AuthSection # noqa: F401 +from .authenticator import Authenticator, AuthError, AuthLoadError, AuthSection # noqa: F401 from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection from .keyring import KeyringAuthenticator, KeyringAuthSection from .simple import SimpleAuthenticator, SimpleAuthSection From e1bda94329f5f0ff6c4a94a86f25d13efdb0d66d Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 1 Jun 2021 09:18:08 +0000 Subject: [PATCH 298/524] Load credential file from correct path --- PFERD/auth/__init__.py | 2 +- PFERD/auth/credential_file.py | 5 +++-- PFERD/config.py | 1 + 3 files changed, 5 
insertions(+), 3 deletions(-) diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 06b3ba4..277cade 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -16,7 +16,7 @@ AuthConstructor = Callable[[ AUTHENTICATORS: Dict[str, AuthConstructor] = { "credential-file": lambda n, s, c: - CredentialFileAuthenticator(n, CredentialFileAuthSection(s)), + CredentialFileAuthenticator(n, CredentialFileAuthSection(s), c), "keyring": lambda n, s, c: KeyringAuthenticator(n, KeyringAuthSection(s)), "simple": lambda n, s, c: diff --git a/PFERD/auth/credential_file.py b/PFERD/auth/credential_file.py index 540b65b..30a56ba 100644 --- a/PFERD/auth/credential_file.py +++ b/PFERD/auth/credential_file.py @@ -1,6 +1,7 @@ from pathlib import Path from typing import Tuple +from ..config import Config from ..utils import fmt_real_path from .authenticator import Authenticator, AuthLoadError, AuthSection @@ -14,10 +15,10 @@ class CredentialFileAuthSection(AuthSection): class CredentialFileAuthenticator(Authenticator): - def __init__(self, name: str, section: CredentialFileAuthSection) -> None: + def __init__(self, name: str, section: CredentialFileAuthSection, config: Config) -> None: super().__init__(name) - path = section.path() + path = config.default_section.working_dir() / section.path() try: with open(path) as f: lines = list(f) diff --git a/PFERD/config.py b/PFERD/config.py index 1462d82..0ea7abc 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -69,6 +69,7 @@ class Section: class DefaultSection(Section): def working_dir(self) -> Path: + # TODO Change to working dir instead of manually prepending it to paths pathstr = self.s.get("working_dir", ".") return Path(pathstr).expanduser() From f656e3ff34b05b486a4f5ad63ec6174e4080cb0d Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 1 Jun 2021 09:18:17 +0000 Subject: [PATCH 299/524] Fix credential parsing --- PFERD/auth/credential_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/PFERD/auth/credential_file.py b/PFERD/auth/credential_file.py index 30a56ba..d0fcdda 100644 --- a/PFERD/auth/credential_file.py +++ b/PFERD/auth/credential_file.py @@ -37,8 +37,8 @@ class CredentialFileAuthenticator(Authenticator): if not pline.startswith("password="): raise AuthLoadError("Second line must start with 'password='") - self._username = uline[:9] - self._password = pline[:9] + self._username = uline[9:] + self._password = pline[9:] async def credentials(self) -> Tuple[str, str]: return self._username, self._password From 85b9f45085b409357f3c509da7f2719f63e5d2f6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 1 Jun 2021 09:49:30 +0000 Subject: [PATCH 300/524] Bump version to 3.0.1 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7e33ae..87c1d05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.0.1 - 2021-06-01 + ### Added - `credential-file` authenticator - `--credential-file` option for `kit-ilias-web` command diff --git a/PFERD/version.py b/PFERD/version.py index e26dabb..2aae99d 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.0.0" +VERSION = "3.0.1" From 1fc8e9eb7ad99ad8c950c76398aab64b05c7d801 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 1 Jun 2021 10:00:59 +0000 Subject: [PATCH 301/524] Document credential file authenticator config options --- CONFIG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 7826b04..feeade3 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -182,8 +182,11 @@ via the terminal. ### The `credential-file` authenticator -This authenticator reads a username and a password from a credential file. The -credential file has exactly two lines (trailing newline optional). The first +This authenticator reads a username and a password from a credential file. 
+ +- `path`: Path to the credential file. (Required) + +The credential file has exactly two lines (trailing newline optional). The first line starts with `username=` and contains the username, the second line starts with `password=` and contains the password. The username and password may contain any characters except a line break. From 31b6311e993439b2bbb087511ca012e140003d9e Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 1 Jun 2021 19:02:55 +0200 Subject: [PATCH 302/524] Remove incorrect tmp file explain message --- PFERD/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 5ae62bb..b274b6b 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -147,7 +147,6 @@ def main() -> None: log.unlock() log.explain_topic("Interrupted, exiting immediately") log.explain("Open files and connections are left for the OS to clean up") - log.explain("Temporary files are not cleaned up") pferd.print_report() # TODO Clean up tmp files # And when those files *do* actually get cleaned up properly, From fc31100a0f6e1933cf084e46898ad20d33d892b9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 4 Jun 2021 18:02:45 +0200 Subject: [PATCH 303/524] Always use '/' as path separator for regex rules Previously, regex-matching paths on windows would, in some cases, require four backslashes ('\\\\') to escape a single path separator. That's just too much. With this commit, regex transforms now use '/' instead of '\' as path separator, meaning rules can more easily be shared between platforms (although they are not guaranteed to be 100% compatible since on Windows, '\' is still recognized as a path separator). To make rules more intuitive to write, local relative paths are now also printed with '/' as path separator on Windows. Since Windows also accepts '/' as path separator, this change doesn't really affect other rules that parse their sides as paths. 
--- CHANGELOG.md | 3 +++ PFERD/transformer.py | 4 ++-- PFERD/utils.py | 8 +++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 87c1d05..980f96e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Changed +- Use `/` instead of `\` as path separator for (regex) rules on Windows + ## 3.0.1 - 2021-06-01 ### Added diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 83ffde4..ed123eb 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -10,7 +10,7 @@ from pathlib import PurePath from typing import Dict, Optional, Sequence, Union from .logging import log -from .utils import fmt_path +from .utils import fmt_path, str_path class Rule(ABC): @@ -116,7 +116,7 @@ class ReRule(Rule): self._right = right def transform(self, path: PurePath) -> Union[PurePath, bool]: - if match := re.fullmatch(self._left, str(path)): + if match := re.fullmatch(self._left, str_path(path)): if isinstance(self._right, bool): return self._right or path diff --git a/PFERD/utils.py b/PFERD/utils.py index 397feda..7c7b6f4 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -91,8 +91,14 @@ def url_set_query_params(url: str, params: Dict[str, str]) -> str: return result +def str_path(path: PurePath) -> str: + if not path.parts: + return "." 
+ return "/".join(path.parts) + + def fmt_path(path: PurePath) -> str: - return repr(str(path)) + return repr(str_path(path)) def fmt_real_path(path: Path) -> str: From df3ad3d890e0c7e21fbb68305f3c1016f58c2523 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 4 Jun 2021 18:33:02 +0200 Subject: [PATCH 304/524] Add 'skip' option to crawlers --- CHANGELOG.md | 3 +++ CONFIG.md | 3 +++ PFERD/auth/authenticator.py | 6 +++++- PFERD/crawl/__init__.py | 2 +- PFERD/crawl/crawler.py | 9 +++++++++ PFERD/pferd.py | 39 +++++++++++++++++++++++++------------ 6 files changed, 48 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 980f96e..32cbe77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- `skip` option for crawlers + ### Changed - Use `/` instead of `\` as path separator for (regex) rules on Windows diff --git a/CONFIG.md b/CONFIG.md index feeade3..2f18be1 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -49,6 +49,9 @@ see the type's [documentation](#crawler-types) below. The following options are common to all crawlers: - `type`: The available types are specified in [this section](#crawler-types). +- `skip`: Whether the crawler should be skipped during normal execution. The + crawler can still be executed manually using the `--crawler` or `-C` flags. + (Default: `no`) - `output_dir`: The directory the crawler synchronizes files to. A crawler will never place any files outside of this directory. (Default: the crawler's name) - `redownload`: When to download a file that is already present locally. 
diff --git a/PFERD/auth/authenticator.py b/PFERD/auth/authenticator.py index f588bc4..643a2d5 100644 --- a/PFERD/auth/authenticator.py +++ b/PFERD/auth/authenticator.py @@ -13,7 +13,11 @@ class AuthError(Exception): class AuthSection(Section): - pass + def type(self) -> str: + value = self.s.get("type") + if value is None: + self.missing_value("type") + return value class Authenticator(ABC): diff --git a/PFERD/crawl/__init__.py b/PFERD/crawl/__init__.py index 297c490..7eb2fb1 100644 --- a/PFERD/crawl/__init__.py +++ b/PFERD/crawl/__init__.py @@ -3,7 +3,7 @@ from typing import Callable, Dict from ..auth import Authenticator from ..config import Config -from .crawler import Crawler, CrawlError # noqa: F401 +from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection from .local_crawler import LocalCrawler, LocalCrawlerSection diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index e990f16..d61783f 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -132,6 +132,15 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]): class CrawlerSection(Section): + def type(self) -> str: + value = self.s.get("type") + if value is None: + self.missing_value("type") + return value + + def skip(self) -> bool: + return self.s.getboolean("skip", fallback=False) + def output_dir(self, name: str) -> Path: # TODO Use removeprefix() after switching to 3.9 if name.startswith("crawl:"): diff --git a/PFERD/pferd.py b/PFERD/pferd.py index ac373cf..d98b426 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -3,9 +3,9 @@ from typing import Dict, List, Optional from rich.markup import escape -from .auth import AUTHENTICATORS, Authenticator, AuthError +from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection from .config import Config, ConfigOptionError -from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler +from .crawl import CRAWLERS, 
Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler from .logging import log from .utils import fmt_path @@ -26,19 +26,22 @@ class Pferd: self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} - def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]: - log.explain_topic("Deciding which crawlers to run") - crawl_sections = [name for name, _ in config.crawl_sections()] + def _find_config_crawlers(self, config: Config) -> List[str]: + crawl_sections = [] - if cli_crawlers is None: - log.explain("No crawlers specified on CLI") - log.explain("Running all crawlers specified in config") - return crawl_sections + for name, section in config.crawl_sections(): + if CrawlerSection(section).skip(): + log.explain(f"Skipping {name!r}") + else: + crawl_sections.append(name) + return crawl_sections + + def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]: if len(cli_crawlers) != len(set(cli_crawlers)): raise PferdLoadError("Some crawlers were selected multiple times") - log.explain("Crawlers specified on CLI") + crawl_sections = [name for name, _ in config.crawl_sections()] crawlers_to_run = [] # With crawl: prefix unknown_names = [] # Without crawl: prefix @@ -62,10 +65,22 @@ class Pferd: return crawlers_to_run + def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]: + log.explain_topic("Deciding which crawlers to run") + + if cli_crawlers is None: + log.explain("No crawlers specified on CLI") + log.explain("Running crawlers specified in config") + return self._find_config_crawlers(config) + else: + log.explain("Crawlers specified on CLI") + return self._find_cli_crawlers(config, cli_crawlers) + def _load_authenticators(self) -> None: for name, section in self._config.auth_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") - auth_type = section.get("type") + + auth_type = AuthSection(section).type() 
authenticator_constructor = AUTHENTICATORS.get(auth_type) if authenticator_constructor is None: raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}") @@ -80,7 +95,7 @@ class Pferd: for name, section in self._config.crawl_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") - crawl_type = section.get("type") + crawl_type = CrawlerSection(section).type() crawler_constructor = CRAWLERS.get(crawl_type) if crawler_constructor is None: raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}") From 8ab462fb87e8bdfac8bfd6821645dd9f4617e898 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 4 Jun 2021 19:23:33 +0200 Subject: [PATCH 305/524] Use the exercise label instead of the button name as path --- CHANGELOG.md | 2 ++ PFERD/crawl/ilias/kit_ilias_html.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32cbe77..171a61c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ ambiguous situations. 
### Changed - Use `/` instead of `\` as path separator for (regex) rules on Windows +- Use the label to the left for exercises instead of the button name to + determine the folder name ## 3.0.1 - 2021-06-01 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 64491f9..db9a303 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -293,7 +293,13 @@ class IliasPage: # Add each listing as a new for listing in file_listings: - file_name = _sanitize_path_name(listing.getText().strip()) + parent_container: Tag = listing.findParent( + "div", attrs={"class": lambda x: x and "form-group" in x} + ) + label_container: Tag = parent_container.find( + attrs={"class": lambda x: x and "control-label" in x} + ) + file_name = _sanitize_path_name(label_container.getText().strip()) url = self._abs_url_from_link(listing) log.explain(f"Found exercise detail {file_name!r} at {url}") results.append(IliasPageElement( From 61d902d7153f2942e24f92bd9e0a35e39be05563 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 9 Jun 2021 17:42:38 +0200 Subject: [PATCH 306/524] Overhaul transform logic -re-> arrows now rename their parent directories (like -->) and don't require a full match (like -exact->). Their old behaviour is available as -exact-re->. Also, this change adds the ">>" arrow head, which modifies the current path and continues to the next rule when it matches. --- CHANGELOG.md | 3 + PFERD/transformer.py | 540 +++++++++++++++++++++++-------------------- 2 files changed, 298 insertions(+), 245 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 171a61c..ffc6e81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,8 +24,11 @@ ambiguous situations. 
### Added - `skip` option for crawlers +- Rules with `>>` instead of `>` as arrow head +- `-exact-re->` arrow (behaves like `-re->` did previously) ### Changed +- The `-re->` arrow can now rename directories (like `-->`) - Use `/` instead of `\` as path separator for (regex) rules on Windows - Use the label to the left for exercises instead of the button name to determine the folder name diff --git a/PFERD/transformer.py b/PFERD/transformer.py index ed123eb..bf51d6a 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -1,151 +1,159 @@ -# I'm sorry that this code has become a bit dense and unreadable. While -# reading, it is important to remember what True and False mean. I'd love to -# have some proper sum-types for the inputs and outputs, they'd make this code -# a lot easier to understand. - import ast import re from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum from pathlib import PurePath -from typing import Dict, Optional, Sequence, Union +from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union from .logging import log from .utils import fmt_path, str_path -class Rule(ABC): - @abstractmethod - def transform(self, path: PurePath) -> Union[PurePath, bool]: - """ - Try to apply this rule to the path. Returns another path if the rule - was successfully applied, True if the rule matched but resulted in an - exclamation mark, and False if the rule didn't match at all. 
- """ +class ArrowHead(Enum): + NORMAL = 0 + SEQUENCE = 1 + +class Ignore: + pass + + +class Empty: + pass + + +RightSide = Union[str, Ignore, Empty] + + +@dataclass +class Transformed: + path: PurePath + + +class Ignored: + pass + + +TransformResult = Optional[Union[Transformed, Ignored]] + + +@dataclass +class Rule: + left: str + name: str + head: ArrowHead + right: RightSide + + def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]: + if isinstance(self.right, str): + return self.right + elif isinstance(self.right, Ignore): + return Ignored() + elif isinstance(self.right, Empty): + return Transformed(path) + else: + raise RuntimeError(f"Right side has invalid type {type(self.right)}") + + +class Transformation(ABC): + def __init__(self, rule: Rule): + self.rule = rule + + @abstractmethod + def transform(self, path: PurePath) -> TransformResult: pass -# These rules all use a Union[T, bool] for their right side. They are passed a -# T if the arrow's right side was a normal string, True if it was an -# exclamation mark and False if it was missing entirely. 
- -class NormalRule(Rule): - def __init__(self, left: PurePath, right: Union[PurePath, bool]): - - self._left = left - self._right = right - - def _match_prefix(self, path: PurePath) -> Optional[PurePath]: - left_parts = list(reversed(self._left.parts)) - path_parts = list(reversed(path.parts)) - - if len(left_parts) > len(path_parts): +class ExactTf(Transformation): + def transform(self, path: PurePath) -> TransformResult: + if path != PurePath(self.rule.left): return None - while left_parts and path_parts: - left_part = left_parts.pop() - path_part = path_parts.pop() + right = self.rule.right_result(path) + if not isinstance(right, str): + return right - if left_part != path_part: - return None + return Transformed(PurePath(right)) - if left_parts: + +class ExactReTf(Transformation): + def transform(self, path: PurePath) -> TransformResult: + match = re.fullmatch(self.rule.left, str_path(path)) + if not match: return None - path_parts.reverse() - return PurePath(*path_parts) + right = self.rule.right_result(path) + if not isinstance(right, str): + return right - def transform(self, path: PurePath) -> Union[PurePath, bool]: - if rest := self._match_prefix(path): - if isinstance(self._right, bool): - return self._right or path + # For some reason, mypy thinks that "groups" has type List[str]. But + # since elements of "match.groups()" can be None, mypy is wrong. 
+ groups: Sequence[Optional[str]] = [match[0]] + list(match.groups()) + + locals_dir: Dict[str, Union[str, int, float]] = {} + for i, group in enumerate(groups): + if group is None: + continue + + locals_dir[f"g{i}"] = group + + try: + locals_dir[f"i{i}"] = int(group) + except ValueError: + pass + + try: + locals_dir[f"f{i}"] = float(group) + except ValueError: + pass + + result = eval(f"f{right!r}", {}, locals_dir) + return Transformed(PurePath(result)) + + +class RenamingParentsTf(Transformation): + def __init__(self, sub_tf: Transformation): + super().__init__(sub_tf.rule) + self.sub_tf = sub_tf + + def transform(self, path: PurePath) -> TransformResult: + for i in range(len(path.parts), -1, -1): + parent = PurePath(*path.parts[:i]) + child = PurePath(*path.parts[i:]) + + transformed = self.sub_tf.transform(parent) + if not transformed: + continue + elif isinstance(transformed, Transformed): + return Transformed(transformed.path / child) + elif isinstance(transformed, Ignored): + return transformed else: - return self._right / rest + raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}") - return False + return None -class ExactRule(Rule): - def __init__(self, left: PurePath, right: Union[PurePath, bool]): - self._left = left - self._right = right +class RenamingPartsTf(Transformation): + def __init__(self, sub_tf: Transformation): + super().__init__(sub_tf.rule) + self.sub_tf = sub_tf - def transform(self, path: PurePath) -> Union[PurePath, bool]: - if path == self._left: - if isinstance(self._right, bool): - return self._right or path - else: - return self._right - - return False - - -class NameRule(Rule): - def __init__(self, subrule: Rule): - self._subrule = subrule - - def transform(self, path: PurePath) -> Union[PurePath, bool]: - matched = False + def transform(self, path: PurePath) -> TransformResult: result = PurePath() - for part in path.parts: - part_result = self._subrule.transform(PurePath(part)) - if 
isinstance(part_result, PurePath): - matched = True - result /= part_result - elif part_result: - # If any subrule call ignores its path segment, the entire path - # should be ignored - return True - else: - # The subrule doesn't modify this segment, but maybe other - # segments + transformed = self.sub_tf.transform(PurePath(part)) + if not transformed: result /= part + elif isinstance(transformed, Transformed): + result /= transformed.path + elif isinstance(transformed, Ignored): + return transformed + else: + raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}") - if matched: - return result - else: - # The subrule has modified no segments, so this name version of it - # doesn't match - return False - - -class ReRule(Rule): - def __init__(self, left: str, right: Union[str, bool]): - self._left = left - self._right = right - - def transform(self, path: PurePath) -> Union[PurePath, bool]: - if match := re.fullmatch(self._left, str_path(path)): - if isinstance(self._right, bool): - return self._right or path - - vars: Dict[str, Union[str, int, float]] = {} - - # For some reason, mypy thinks that "groups" has type List[str]. - # But since elements of "match.groups()" can be None, mypy is - # wrong. 
- groups: Sequence[Optional[str]] = [match[0]] + list(match.groups()) - for i, group in enumerate(groups): - if group is None: - continue - - vars[f"g{i}"] = group - - try: - vars[f"i{i}"] = int(group) - except ValueError: - pass - - try: - vars[f"f{i}"] = float(group) - except ValueError: - pass - - result = eval(f"f{self._right!r}", vars) - return PurePath(result) - - return False + return None class RuleParseError(Exception): @@ -162,18 +170,15 @@ class RuleParseError(Exception): log.error_contd(f"{spaces}^--- {self.reason}") +T = TypeVar("T") + + class Line: def __init__(self, line: str, line_nr: int): self._line = line self._line_nr = line_nr self._index = 0 - def get(self) -> Optional[str]: - if self._index < len(self._line): - return self._line[self._index] - - return None - @property def line(self) -> str: return self._line @@ -190,155 +195,192 @@ class Line: def index(self, index: int) -> None: self._index = index - def advance(self) -> None: - self._index += 1 + @property + def rest(self) -> str: + return self.line[self.index:] - def expect(self, string: str) -> None: - for char in string: - if self.get() == char: - self.advance() - else: - raise RuleParseError(self, f"Expected {char!r}") + def peek(self, amount: int = 1) -> str: + return self.rest[:amount] + + def take(self, amount: int = 1) -> str: + string = self.peek(amount) + self.index += len(string) + return string + + def expect(self, string: str) -> str: + if self.peek(len(string)) == string: + return self.take(len(string)) + else: + raise RuleParseError(self, f"Expected {string!r}") + + def expect_with(self, string: str, value: T) -> T: + self.expect(string) + return value + + def one_of(self, parsers: List[Callable[[], T]], description: str) -> T: + for parser in parsers: + index = self.index + try: + return parser() + except RuleParseError: + self.index = index + + raise RuleParseError(self, description) + + +# RULE = LEFT SPACE '-' NAME '-' HEAD (SPACE RIGHT)? 
+# SPACE = ' '+ +# NAME = '' | 'exact' | 'name' | 're' | 'exact-re' | 'name-re' +# HEAD = '>' | '>>' +# LEFT = STR | QUOTED_STR +# RIGHT = STR | QUOTED_STR | '!' + + +def parse_zero_or_more_spaces(line: Line) -> None: + while line.peek() == " ": + line.take() + + +def parse_one_or_more_spaces(line: Line) -> None: + line.expect(" ") + parse_zero_or_more_spaces(line) + + +def parse_str(line: Line) -> str: + result = [] + while c := line.peek(): + if c == " ": + break + else: + line.take() + result.append(c) + + if result: + return "".join(result) + else: + raise RuleParseError(line, "Expected non-space character") QUOTATION_MARKS = {'"', "'"} -def parse_string_literal(line: Line) -> str: +def parse_quoted_str(line: Line) -> str: escaped = False # Points to first character of string literal start_index = line.index - quotation_mark = line.get() + quotation_mark = line.peek() if quotation_mark not in QUOTATION_MARKS: - # This should never happen as long as this function is only called from - # parse_string. 
- raise RuleParseError(line, "Invalid quotation mark") - line.advance() + raise RuleParseError(line, "Expected quotation mark") + line.take() - while c := line.get(): + while c := line.peek(): if escaped: escaped = False - line.advance() + line.take() elif c == quotation_mark: - line.advance() + line.take() stop_index = line.index literal = line.line[start_index:stop_index] - return ast.literal_eval(literal) + try: + return ast.literal_eval(literal) + except SyntaxError as e: + line.index = start_index + raise RuleParseError(line, str(e)) from e elif c == "\\": escaped = True - line.advance() + line.take() else: - line.advance() + line.take() raise RuleParseError(line, "Expected end of string literal") -def parse_until_space_or_eol(line: Line) -> str: - result = [] - while c := line.get(): - if c == " ": - break - result.append(c) - line.advance() - - return "".join(result) - - -def parse_string(line: Line) -> Union[str, bool]: - if line.get() in QUOTATION_MARKS: - return parse_string_literal(line) +def parse_left(line: Line) -> str: + if line.peek() in QUOTATION_MARKS: + return parse_quoted_str(line) else: - string = parse_until_space_or_eol(line) + return parse_str(line) + + +def parse_right(line: Line) -> Union[str, Ignore]: + c = line.peek() + if c in QUOTATION_MARKS: + return parse_quoted_str(line) + else: + string = parse_str(line) if string == "!": - return True + return Ignore() return string -def parse_arrow(line: Line) -> str: - line.expect("-") - - name = [] - while True: - c = line.get() - if not c: - raise RuleParseError(line, "Expected rest of arrow") - elif c == "-": - line.advance() - c = line.get() - if not c: - raise RuleParseError(line, "Expected rest of arrow") - elif c == ">": - line.advance() - break # End of arrow - else: - name.append("-") - continue - else: - name.append(c) - - line.advance() - - return "".join(name) +def parse_arrow_name(line: Line) -> str: + return line.one_of([ + lambda: line.expect("exact-re"), + lambda: 
line.expect("exact"), + lambda: line.expect("name-re"), + lambda: line.expect("name"), + lambda: line.expect("re"), + lambda: line.expect(""), + ], "Expected arrow name") -def parse_whitespace(line: Line) -> None: - line.expect(" ") - while line.get() == " ": - line.advance() +def parse_arrow_head(line: Line) -> ArrowHead: + return line.one_of([ + lambda: line.expect_with(">>", ArrowHead.SEQUENCE), + lambda: line.expect_with(">", ArrowHead.NORMAL), + ], "Expected arrow head") def parse_eol(line: Line) -> None: - if line.get() is not None: + if line.peek(): raise RuleParseError(line, "Expected end of line") def parse_rule(line: Line) -> Rule: - # Parse left side - leftindex = line.index - left = parse_string(line) - if isinstance(left, bool): - line.index = leftindex - raise RuleParseError(line, "Left side can't be '!'") - leftpath = PurePath(left) + parse_zero_or_more_spaces(line) + left = parse_left(line) - # Parse arrow - parse_whitespace(line) - arrowindex = line.index - arrowname = parse_arrow(line) + parse_one_or_more_spaces(line) - # Parse right side - if line.get(): - parse_whitespace(line) - right = parse_string(line) + line.expect("-") + name = parse_arrow_name(line) + line.expect("-") + head = parse_arrow_head(line) + + index = line.index + right: RightSide + try: + parse_zero_or_more_spaces(line) + parse_eol(line) + right = Empty() + except RuleParseError: + line.index = index + parse_one_or_more_spaces(line) + right = parse_right(line) + parse_eol(line) + + return Rule(left, name, head, right) + + +def parse_transformation(line: Line) -> Transformation: + rule = parse_rule(line) + + if rule.name == "": + return RenamingParentsTf(ExactTf(rule)) + elif rule.name == "exact": + return ExactTf(rule) + elif rule.name == "name": + return RenamingPartsTf(ExactTf(rule)) + elif rule.name == "re": + return RenamingParentsTf(ExactReTf(rule)) + elif rule.name == "exact-re": + return ExactReTf(rule) + elif rule.name == "name-re": + return 
RenamingPartsTf(ExactReTf(rule)) else: - right = False - rightpath: Union[PurePath, bool] - if isinstance(right, bool): - rightpath = right - else: - rightpath = PurePath(right) - - parse_eol(line) - - # Dispatch - if arrowname == "": - return NormalRule(leftpath, rightpath) - elif arrowname == "name": - if len(leftpath.parts) > 1: - line.index = leftindex - raise RuleParseError(line, "SOURCE must be a single name, not multiple segments") - return NameRule(ExactRule(leftpath, rightpath)) - elif arrowname == "exact": - return ExactRule(leftpath, rightpath) - elif arrowname == "re": - return ReRule(left, right) - elif arrowname == "name-re": - return NameRule(ReRule(left, right)) - else: - line.index = arrowindex + 1 # For nicer error message - raise RuleParseError(line, f"Invalid arrow name {arrowname!r}") + raise RuntimeError(f"Invalid arrow name {rule.name!r}") class Transformer: @@ -347,32 +389,40 @@ class Transformer: May throw a RuleParseException. """ - self._rules = [] + self._tfs = [] for i, line in enumerate(rules.split("\n")): line = line.strip() if line: - rule = parse_rule(Line(line, i)) - self._rules.append((line, rule)) + tf = parse_transformation(Line(line, i)) + self._tfs.append((line, tf)) def transform(self, path: PurePath) -> Optional[PurePath]: - for i, (line, rule) in enumerate(self._rules): + for i, (line, tf) in enumerate(self._tfs): log.explain(f"Testing rule {i+1}: {line}") try: - result = rule.transform(path) + result = tf.transform(path) except Exception as e: log.warn(f"Error while testing rule {i+1}: {line}") log.warn_contd(str(e)) continue - if isinstance(result, PurePath): - log.explain(f"Match found, transformed path to {fmt_path(result)}") - return result - elif result: # Exclamation mark - log.explain("Match found, path ignored") - return None - else: + if not result: continue - log.explain("No rule matched, path is unchanged") + if isinstance(result, Ignored): + log.explain("Match found, path ignored") + return None + + if 
tf.rule.head == ArrowHead.NORMAL: + log.explain(f"Match found, transformed path to {fmt_path(result.path)}") + path = result.path + break + elif tf.rule.head == ArrowHead.SEQUENCE: + log.explain(f"Match found, updated path to {fmt_path(result.path)}") + path = result.path + else: + raise RuntimeError(f"Invalid transform result of type {type(result)}: {result}") + + log.explain(f"Final result: {fmt_path(path)}") return path From f28bbe6b0c11c165ad604b6ab33730a37800604a Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 9 Jun 2021 22:22:40 +0200 Subject: [PATCH 307/524] Update transform rule documentation It's still missing an example that uses rules with ">>" arrows. --- CONFIG.md | 128 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 40 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 2f18be1..1793ddc 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -222,56 +222,87 @@ This authenticator does not support usernames. Transformation rules are rules for renaming and excluding files and directories. They are specified line-by-line in a crawler's `transform` option. When a crawler needs to apply a rule to a path, it goes through this list top-to-bottom -and choose the first matching rule. +and applies the first matching rule. To see this process in action, you can use the `--debug-transforms` or flag or the `--explain` flag. -Each line has the format `SOURCE ARROW TARGET` where `TARGET` is optional. -`SOURCE` is either a normal path without spaces (e. g. `foo/bar`), or a string -literal delimited by `"` or `'` (e. g. `"foo\" bar/baz"`). Python's string -escape syntax is supported. Trailing slashes are ignored. `TARGET` can be -formatted like `SOURCE`, but it can also be a single exclamation mark without -quotes (`!`). `ARROW` is one of `-->`, `-name->`, `-exact->`, `-re->` and -`-name-re->` +Each rule has the format `SOURCE ARROW TARGET` (e. g. `foo/bar --> foo/baz`). +The arrow specifies how the source and target are interpreted. 
The different +kinds of arrows are documented below. -If a rule's target is `!`, this means that when the rule matches on a path, the -corresponding file or directory is ignored. If a rule's target is missing, the -path is matched but not modified. +`SOURCE` and `TARGET` are either a bunch of characters without spaces (e. g. +`foo/bar`) or string literals (e. g, `"foo/b a r"`). The former syntax has no +concept of escaping characters, so the backslash is just another character. The +string literals however support Python's escape syntax (e. g. +`"foo\\bar\tbaz"`). This also means that in string literals, backslashes must be +escaped. + +`TARGET` can additionally be a single exclamation mark `!` (*not* `"!"`). When a +rule with a `!` as target matches a path, the corresponding file or directory is +ignored by the crawler instead of renamed. + +`TARGET` can also be omitted entirely. When a rule without target matches a +path, the path is returned unmodified. This is useful to prevent rules further +down from matching instead. + +Each arrow's behaviour can be modified slightly by changing the arrow's head +from `>` to `>>`. When a rule with a `>>` arrow head matches a path, it doesn't +return immediately like a normal arrow. Instead, it replaces the current path +with its output and continues on to the next rule. In effect, this means that +multiple rules can be applied sequentially. ### The `-->` arrow -The `-->` arrow is a basic renaming operation. If a path begins with `SOURCE`, -that part of the path is replaced with `TARGET`. This means that the rule -`foo/bar --> baz` would convert `foo/bar` into `baz`, but also `foo/bar/xyz` -into `baz/xyz`. The rule `foo --> !` would ignore a directory named `foo` as -well as all its contents. +The `-->` arrow is a basic renaming operation for files and directories. If a +path matches `SOURCE`, it is renamed to `TARGET`. 
+ +Example: `foo/bar --> baz` +- Doesn't match `foo`, `a/foo/bar` or `foo/baz` +- Converts `foo/bar` into `baz` +- Converts `foo/bar/wargl` into `bar/wargl` + +Example: `foo/bar --> !` +- Doesn't match `foo`, `a/foo/bar` or `foo/baz` +- Ignores `foo/bar` and any of its children ### The `-name->` arrow The `-name->` arrow lets you rename files and directories by their name, regardless of where they appear in the file tree. Because of this, its `SOURCE` must not contain multiple path segments, only a single name. This restriction -does not apply to its `TARGET`. The `-name->` arrow is not applied recursively -to its own output to prevent infinite loops. +does not apply to its `TARGET`. -For example, the rule `foo -name-> bar/baz` would convert `a/foo` into -`a/bar/baz` and `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz`. The rule `foo --name-> !` would ignore all directories and files named `foo`. +Example: `foo -name-> bar/baz` +- Doesn't match `a/foobar/b` or `x/Foo/y/z` +- Converts `hello/foo` into `hello/bar/baz` +- Converts `foo/world` into `bar/baz/world` +- Converts `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz` + +Example: `foo -name-> !` +- Doesn't match `a/foobar/b` or `x/Foo/y/z` +- Ignores any path containing a segment `foo` ### The `-exact->` arrow -The `-exact->` arrow requires the path to match `SOURCE` exactly. This means -that the rule `foo/bar -exact-> baz` would still convert `foo/bar` into `baz`, -but `foo/bar/xyz` would be unaffected. Also, `foo -exact-> !` would only ignore -`foo`, but not its contents (if it has any). The examples below show why this is -useful. +The `-exact->` arrow requires the path to match `SOURCE` exactly. The examples +below show why this is useful. 
+ +Example: `foo/bar -exact-> baz` +- Doesn't match `foo`, `a/foo/bar` or `foo/baz` +- Converts `foo/bar` into `baz` +- Doesn't match `foo/bar/wargl` + +Example: `foo/bar -exact-> !` +- Doesn't match `foo`, `a/foo/bar` or `foo/baz` +- Ignores only `foo/bar`, not its children ### The `-re->` arrow -The `-re->` arrow uses regular expressions. `SOURCE` is a regular expression -that must match the entire path. If this is the case, then the capturing groups -are available in `TARGET` for formatting. +The `-re->` arrow is like the `-->` arrow but with regular expressions. `SOURCE` +is a regular expression and `TARGET` an f-string based template. If a path +matches `SOURCE`, the output path is created using `TARGET` as template. +`SOURCE` is automatically anchored. `TARGET` uses Python's [format string syntax][3]. The *n*-th capturing group can be referred to as `{g}` (e. g. `{g3}`). `{g0}` refers to the original path. @@ -288,18 +319,36 @@ can use `{i3:05}`. PFERD even allows you to write entire expressions inside the curly braces, for example `{g2.lower()}` or `{g3.replace(' ', '_')}`. +Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` +- Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` +- Converts `foo/bar` into `BOOH/fear` +- Converts `fooooo/bear` into `BOOOOOH/fear` +- Converts `foo/bar/baz` into `BOOH/fear/baz` + [3]: "Format String Syntax" ### The `-name-re->` arrow The `-name-re>` arrow is like a combination of the `-name->` and `-re->` arrows. -Instead of the `SOURCE` being the name of a directory or file, it's a regex that -is matched against the names of directories and files. `TARGET` works like the -`-re->` arrow's target. -For example, the arrow `(.*)\.jpeg -name-re-> {g1}.jpg` will rename all `.jpeg` -extensions into `.jpg`. The arrow `\..+ -name-re-> !` will ignore all files and -directories starting with `.`. 
+Example: `(.*)\.jpeg -name-re-> {g1}.jpg` +- Doesn't match `foo/bar.png`, `baz.JPEG` or `hello,jpeg` +- Converts `foo/bar.jpeg` into `foo/bar.jpg` +- Converts `foo.jpeg/bar/baz.jpeg` into `foo.jpg/bar/baz.jpg` + +Example: `\..+ -name-re-> !` +- Doesn't match `.`, `test`, `a.b` +- Ignores all files and directories starting with `.`. + +### The `-exact-re->` arrow + +The `-exact-re>` arrow is like a combination of the `-exact->` and `-re->` arrows. + +Example: `f(oo+)/be?ar -exactre-> B{g1.upper()}H/fear` +- Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` +- Converts `foo/bar` into `BOOH/fear` +- Converts `fooooo/bear` into `BOOOOOH/fear` +- Doesn't match `foo/bar/baz` ### Example: Tutorials @@ -327,7 +376,7 @@ The second rule is required for many crawlers since they use the rules to decide which directories to crawl. If it was missing when the crawler looks at `tutorials/`, the third rule would match. This means the crawler would not crawl the `tutorials/` directory and thus not discover that `tutorials/tut02/` -existed. +exists. Since the second rule is only relevant for crawling, the `TARGET` is left out. @@ -352,9 +401,9 @@ To do this, you can use the most powerful of arrows: The regex arrow. Note the escaped backslashes on the `SOURCE` side. -### Example: Crawl a python project +### Example: Crawl a Python project -You are crawling a python project and want to ignore all hidden files (files +You are crawling a Python project and want to ignore all hidden files (files whose name starts with a `.`), all `__pycache__` directories and all markdown files (for some weird reason). @@ -374,8 +423,7 @@ README.md ... ``` -For this task, the name arrows can be used. They are variants of the normal -arrows that only look at the file name instead of the entire path. +For this task, the name arrows can be used. ``` \..* -name-re-> ! 
From bc65ea7ab696bf3f455c49bad4ae4375a75182a8 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 9 Jun 2021 22:35:55 +0200 Subject: [PATCH 308/524] Fix mypy complaining about missing type hints --- scripts/setup | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/setup b/scripts/setup index b48fb1a..f6680bb 100755 --- a/scripts/setup +++ b/scripts/setup @@ -12,6 +12,6 @@ pip install --upgrade setuptools # Installing PFERD itself pip install --editable . -# Installing various tools -pip install --upgrade mypy flake8 autopep8 isort -pip install --upgrade pyinstaller +# Installing tools and type hints +pip install --upgrade mypy flake8 autopep8 isort pyinstaller +pip install --upgrade types-chardet types-certifi From a292c4c437d631d7eae3a0adfd98adbefd52c2eb Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 12 Jun 2021 14:57:29 +0200 Subject: [PATCH 309/524] Add example for ">>" arrow heads --- CONFIG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CONFIG.md b/CONFIG.md index 1793ddc..f2710e1 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -430,3 +430,14 @@ For this task, the name arrows can be used. __pycache__ -name-> ! .*\.md -name-re-> ! ``` + +### Example: Clean up names + +You want to convert all paths into lowercase and replace spaces with underscores +before applying any rules. This can be achieved using the `>>` arrow heads. + +``` +(.*) -re->> "{g1.lower().replace(' ', '_')}" + + +``` From 601e4b936b320e766c0de18d384a92a5750f72b9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 12 Jun 2021 15:00:52 +0200 Subject: [PATCH 310/524] Use new arrow logic in README example config --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d25e86f..681bdf7 100644 --- a/README.md +++ b/README.md @@ -116,17 +116,18 @@ transform = Online-Tests --> ! Vorlesungswerbung --> ! 
+ # Rename folders + Lehrbücher --> Vorlesung + # Note the ">>" arrow head which lets us apply further rules to files moved to "Übung" + Übungsunterlagen -->> Übung + # Move exercises to own folder. Rename them to "Blatt-XX.pdf" to make them sort properly - "Übungsunterlagen/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf + "Übung/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf # Move solutions to own folder. Rename them to "Blatt-XX-Lösung.pdf" to make them sort properly - "Übungsunterlagen/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf + "Übung/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf # The course has nested folders with the same name - flatten them - "Übungsunterlagen/(.+?)/\\1/(.*)" -re-> Übung/{g1}/{g2} - - # Rename remaining folders - Übungsunterlagen --> Übung - Lehrbücher --> Vorlesung + "Übung/(.+?)/\\1" -re-> Übung/{g1} [crawl:Bar] type = kit-ilias-web From 70b33ecfd9ca3230303cc17f39fd8bc634737e2b Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 13 Jun 2021 15:06:50 +0200 Subject: [PATCH 311/524] Add migration notes to changelog Also clean up some other formatting for consistency --- CHANGELOG.md | 5 +++++ CONFIG.md | 6 +++--- README.md | 6 +++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffc6e81..d6049d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,11 @@ ambiguous situations. ## Unreleased +If your config file doesn't do weird things with transforms, it should continue +to work. If your `-re->` arrows behave weirdly, try replacing them with +`-exact-re->` arrows. If you're on Windows, you might need to switch from `\` +path separators to `/` in your regex rules. 
+ ### Added - `skip` option for crawlers - Rules with `>>` instead of `>` as arrow head diff --git a/CONFIG.md b/CONFIG.md index f2710e1..19afbd2 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -342,7 +342,8 @@ Example: `\..+ -name-re-> !` ### The `-exact-re->` arrow -The `-exact-re>` arrow is like a combination of the `-exact->` and `-re->` arrows. +The `-exact-re>` arrow is like a combination of the `-exact->` and `-re->` +arrows. Example: `f(oo+)/be?ar -exactre-> B{g1.upper()}H/fear` - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` @@ -375,8 +376,7 @@ tutorials --> ! The second rule is required for many crawlers since they use the rules to decide which directories to crawl. If it was missing when the crawler looks at `tutorials/`, the third rule would match. This means the crawler would not crawl -the `tutorials/` directory and thus not discover that `tutorials/tut02/` -exists. +the `tutorials/` directory and thus not discover that `tutorials/tut02/` exists. Since the second rule is only relevant for crawling, the `TARGET` is left out. diff --git a/README.md b/README.md index 681bdf7..836147f 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,9 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. ## Basic usage -PFERD can be run directly from the command line with no config file. -Run `pferd -h` to get an overview of available commands and options. -Run `pferd -h` to see which options a command has. +PFERD can be run directly from the command line with no config file. Run `pferd +-h` to get an overview of available commands and options. Run `pferd +-h` to see which options a command has. 
For example, you can download your personal desktop from the KIT ILIAS like this: From 70ec64a48ba8a56a819dfdbacba974f108d1206e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 13 Jun 2021 15:39:22 +0200 Subject: [PATCH 312/524] Fix wrong base URL for multi-stage pages --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 2 +- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6049d2..c09f921 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,9 @@ path separators to `/` in your regex rules. - Use the label to the left for exercises instead of the button name to determine the folder name +### Fixed +- Video pagination handling in ILIAS crawler + ## 3.0.1 - 2021-06-01 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index db9a303..384f0de 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -480,7 +480,7 @@ class IliasPage: return None if "opencast" in str(img_tag["alt"]).lower(): - return IliasElementType.VIDEO_FOLDER + return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED if str(img_tag["src"]).endswith("icon_exc.svg"): return IliasElementType.EXERCISE diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 78428e0..6495da9 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -253,7 +253,7 @@ instance's greatest bottleneck. 
soup = await self._get_page(next_stage_url) log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") log.explain(f"URL: {next_stage_url}") - page = IliasPage(soup, url, parent) + page = IliasPage(soup, next_stage_url, parent) next_stage_url = page.get_next_stage_url() elements.extend(page.get_child_elements()) From 57aef262179f72795e30f1c93254a32f084c0e23 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 13 Jun 2021 16:32:22 +0200 Subject: [PATCH 313/524] Fix name arrows I seem to have (re-)implemented them incorrectly and never tested them. --- PFERD/transformer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index bf51d6a..a37443a 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -142,18 +142,23 @@ class RenamingPartsTf(Transformation): def transform(self, path: PurePath) -> TransformResult: result = PurePath() + any_part_matched = False for part in path.parts: transformed = self.sub_tf.transform(PurePath(part)) if not transformed: result /= part elif isinstance(transformed, Transformed): result /= transformed.path + any_part_matched = True elif isinstance(transformed, Ignored): return transformed else: raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}") - return None + if any_part_matched: + return Transformed(result) + else: + return None class RuleParseError(Exception): From 6e4d423c812c52aff95249ad992dc4889d971208 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 13 Jun 2021 16:50:29 +0200 Subject: [PATCH 314/524] Crawl all video stages in one crawl bar This ensures folders are not renamed, as they are crawled twice --- PFERD/crawl/ilias/kit_ilias_html.py | 6 ++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 +++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 384f0de..41f45e2 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py 
+++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -62,9 +62,11 @@ class IliasPage: log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() - def get_next_stage_url(self) -> Optional[str]: + def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_ilias_opencast_embedding(): - return self.get_child_elements()[0].url + return self.get_child_elements()[0] + if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + return self._find_video_entries_paginated()[0] return None def _is_video_player(self) -> bool: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 6495da9..41c301c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -248,13 +248,18 @@ instance's greatest bottleneck. elements.clear() async with cl: next_stage_url: Optional[str] = url + current_parent = parent while next_stage_url: soup = await self._get_page(next_stage_url) log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") log.explain(f"URL: {next_stage_url}") - page = IliasPage(soup, next_stage_url, parent) - next_stage_url = page.get_next_stage_url() + page = IliasPage(soup, next_stage_url, current_parent) + if next_element := page.get_next_stage_element(): + current_parent = next_element + next_stage_url = next_element.url + else: + next_stage_url = None elements.extend(page.get_child_elements()) From 75fde870c2cc4b0f8b87c80cae87e61f9379ddd2 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 13 Jun 2021 17:23:18 +0200 Subject: [PATCH 315/524] Bump version to 3.1.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c09f921..427219e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.1.0 - 2021-06-13 + If your config file doesn't do weird things with transforms, it should continue to work. If your `-re->` arrows behave weirdly, try replacing them with `-exact-re->` arrows. If you're on Windows, you might need to switch from `\` diff --git a/PFERD/version.py b/PFERD/version.py index 2aae99d..8ce7ae4 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.0.1" +VERSION = "3.1.0" From 80eeb8fe97e28437dcce0e148ffba202fde6a156 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 1 Jul 2021 11:01:55 +0200 Subject: [PATCH 316/524] Add --skip option --- PFERD/__main__.py | 2 +- PFERD/cli/parser.py | 8 ++++++++ PFERD/pferd.py | 24 +++++++++++++++++++----- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index b274b6b..b665feb 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -116,7 +116,7 @@ def main() -> None: sys.exit() try: - pferd = Pferd(config, args.crawler) + pferd = Pferd(config, args.crawler, args.skip) except PferdLoadError as e: log.unlock() log.error(str(e)) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index f5fb215..e753023 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -181,6 +181,14 @@ PARSER.add_argument( help="only execute a single crawler." " Can be specified multiple times to execute multiple crawlers" ) +PARSER.add_argument( + "--skip", "-S", + action="append", + type=str, + metavar="NAME", + help="don't execute this particular crawler." 
+ " Can be specified multiple times to skip multiple crawlers" +) PARSER.add_argument( "--working-dir", type=Path, diff --git a/PFERD/pferd.py b/PFERD/pferd.py index d98b426..726ed45 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -15,13 +15,13 @@ class PferdLoadError(Exception): class Pferd: - def __init__(self, config: Config, cli_crawlers: Optional[List[str]]): + def __init__(self, config: Config, cli_crawlers: Optional[List[str]], cli_skips: Optional[List[str]]): """ May throw PferdLoadError. """ self._config = config - self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers) + self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers, cli_skips) self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} @@ -65,16 +65,30 @@ class Pferd: return crawlers_to_run - def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]: + def _find_crawlers_to_run( + self, + config: Config, + cli_crawlers: Optional[List[str]], + cli_skips: Optional[List[str]], + ) -> List[str]: log.explain_topic("Deciding which crawlers to run") + crawlers: List[str] if cli_crawlers is None: log.explain("No crawlers specified on CLI") log.explain("Running crawlers specified in config") - return self._find_config_crawlers(config) + crawlers = self._find_config_crawlers(config) else: log.explain("Crawlers specified on CLI") - return self._find_cli_crawlers(config, cli_crawlers) + crawlers = self._find_cli_crawlers(config, cli_crawlers) + + skips = {f"crawl:{name}" for name in cli_skips} if cli_skips else set() + for crawler in crawlers: + if crawler in skips: + log.explain(f"Skipping crawler {crawler!r}") + crawlers = [crawler for crawler in crawlers if crawler not in skips] + + return crawlers def _load_authenticators(self) -> None: for name, section in self._config.auth_sections(): From 9ffd6033575ed0ed603663e60bd00b8adb5b8295 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 1 Jul 2021 
11:14:50 +0200 Subject: [PATCH 317/524] Error when using multiple segments with -name-> Previously, PFERD just silently never matched the -name-> arrow. Now, it errors when loading the config file. --- PFERD/transformer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index a37443a..1a56e27 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -41,9 +41,11 @@ TransformResult = Optional[Union[Transformed, Ignored]] @dataclass class Rule: left: str + left_index: int name: str head: ArrowHead right: RightSide + right_index: int def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]: if isinstance(self.right, str): @@ -345,6 +347,7 @@ def parse_eol(line: Line) -> None: def parse_rule(line: Line) -> Rule: parse_zero_or_more_spaces(line) + left_index = line.index left = parse_left(line) parse_one_or_more_spaces(line) @@ -354,19 +357,19 @@ def parse_rule(line: Line) -> Rule: line.expect("-") head = parse_arrow_head(line) - index = line.index + right_index = line.index right: RightSide try: parse_zero_or_more_spaces(line) parse_eol(line) right = Empty() except RuleParseError: - line.index = index + line.index = right_index parse_one_or_more_spaces(line) right = parse_right(line) parse_eol(line) - return Rule(left, name, head, right) + return Rule(left, left_index, name, head, right, right_index) def parse_transformation(line: Line) -> Transformation: @@ -377,6 +380,9 @@ def parse_transformation(line: Line) -> Transformation: elif rule.name == "exact": return ExactTf(rule) elif rule.name == "name": + if len(PurePath(rule.left).parts) > 1: + line.index = rule.left_index + raise RuleParseError(line, "Expected name, not multiple segments") return RenamingPartsTf(ExactTf(rule)) elif rule.name == "re": return RenamingParentsTf(ExactReTf(rule)) From 91200f3684973f40d6409ce38368eceb6e73da0f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 3 Jul 2021 12:07:18 +0200 
Subject: [PATCH 318/524] Fix nondeterministic name deduplication --- PFERD/crawl/crawler.py | 8 +- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 145 +++++++++++++-------- 2 files changed, 93 insertions(+), 60 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index d61783f..d798bc3 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -56,7 +56,7 @@ def noncritical(f: Wrapped) -> Wrapped: return wrapper # type: ignore -AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) +AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) def anoncritical(f: AWrapped) -> AWrapped: @@ -72,14 +72,14 @@ def anoncritical(f: AWrapped) -> AWrapped: Warning: Must only be applied to member functions of the Crawler class! """ - async def wrapper(*args: Any, **kwargs: Any) -> None: + async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: if not (args and isinstance(args[0], Crawler)): raise RuntimeError("@anoncritical must only applied to Crawler methods") crawler = args[0] try: - await f(*args, **kwargs) + return await f(*args, **kwargs) except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: log.warn(str(e)) crawler.error_free = False @@ -87,6 +87,8 @@ def anoncritical(f: AWrapped) -> AWrapped: crawler.error_free = False raise + return None + return wrapper # type: ignore diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 41c301c..a61eb4e 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -12,7 +12,7 @@ from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param -from ..crawler import CrawlError, CrawlWarning, anoncritical +from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, 
HttpCrawlerSection from .file_templates import Links from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement @@ -81,17 +81,16 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([ IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, ]) -AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) +AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: def decorator(f: AWrapped) -> AWrapped: - async def wrapper(*args: Any, **kwargs: Any) -> None: + async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: last_exception: Optional[BaseException] = None for round in range(attempts): try: - await f(*args, **kwargs) - return + return await f(*args, **kwargs) except aiohttp.ContentTypeError: # invalid content type raise CrawlWarning("ILIAS returned an invalid content type") except aiohttp.TooManyRedirects: @@ -230,17 +229,33 @@ instance's greatest bottleneck. # Fill up our task list with the found elements await gather_elements() - tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements] + + tasks: List[Awaitable[None]] = [] + for element in elements: + if handle := await self._handle_ilias_element(PurePath("."), element): + tasks.append(asyncio.create_task(handle)) # And execute them await self.gather(tasks) - async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: + async def _handle_ilias_page( + self, + url: str, + parent: IliasPageElement, + path: PurePath, + ) -> Optional[Awaitable[None]]: maybe_cl = await self.crawl(path) if not maybe_cl: - return - cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + return None + return self._crawl_ilias_page(url, parent, path, maybe_cl) + async def _crawl_ilias_page( + self, + url: str, + parent: IliasPageElement, + path: PurePath, + cl: CrawlToken, + ) -> None: elements: List[IliasPageElement] = [] 
@_iorepeat(3, "crawling folder") @@ -265,7 +280,11 @@ instance's greatest bottleneck. # Fill up our task list with the found elements await gather_elements() - tasks = [self._handle_ilias_element(cl.path, element) for element in elements] + + tasks: List[Awaitable[None]] = [] + for element in elements: + if handle := await self._handle_ilias_element(cl.path, element): + tasks.append(asyncio.create_task(handle)) # And execute them await self.gather(tasks) @@ -274,7 +293,11 @@ instance's greatest bottleneck. # Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical. # If that happens we will be terminated as anoncritical doesn't tream them as non-critical. @_wrap_io_in_warning("handling ilias element") - async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: + async def _handle_ilias_element( + self, + parent_path: PurePath, + element: IliasPageElement, + ) -> Optional[Awaitable[None]]: element_path = PurePath(parent_path, element.name) if element.type in _VIDEO_ELEMENTS: @@ -282,35 +305,41 @@ instance's greatest bottleneck. 
if not self._videos: log.explain("Video crawling is disabled") log.explain("Answer: no") - return + return None else: log.explain("Video crawling is enabled") log.explain("Answer: yes") if element.type == IliasElementType.FILE: - await self._download_file(element, element_path) + return await self._handle_file(element, element_path) elif element.type == IliasElementType.FORUM: log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Forums are not supported") log.explain("Answer: No") + return None elif element.type == IliasElementType.TEST: log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Tests contain no relevant files") log.explain("Answer: No") + return None elif element.type == IliasElementType.LINK: - await self._download_link(element, element_path) + return await self._handle_link(element, element_path) elif element.type == IliasElementType.VIDEO: - await self._download_file(element, element_path) + return await self._handle_file(element, element_path) elif element.type == IliasElementType.VIDEO_PLAYER: - await self._download_video(element, element_path) + return await self._handle_video(element, element_path) elif element.type in _DIRECTORY_PAGES: - await self._handle_ilias_page(element.url, element, element_path) + return await self._handle_ilias_page(element.url, element, element_path) else: # This will retry it a few times, failing everytime. It doesn't make any network # requests, so that's fine. raise CrawlWarning(f"Unknown element type: {element.type!r}") - async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: + async def _handle_link( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Awaitable[None]]: log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain(f"Links type is {self._links}") @@ -318,32 +347,30 @@ instance's greatest bottleneck. 
link_extension = self._links.extension() if not link_template_maybe or not link_extension: log.explain("Answer: No") - return + return None else: log.explain("Answer: Yes") - link_template = link_template_maybe element_path = element_path.with_name(element_path.name + link_extension) maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: - return - dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + return None - @_iorepeat(3, "resolving link") - async def impl() -> None: - async with dl as (bar, sink): - export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") - real_url = await self._resolve_link_target(export_url) + return self._download_link(element, link_template_maybe, maybe_dl) - content = link_template - content = content.replace("{{link}}", real_url) - content = content.replace("{{name}}", element.name) - content = content.replace("{{description}}", str(element.description)) - content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) - sink.file.write(content.encode("utf-8")) - sink.done() + @_iorepeat(3, "resolving link") + async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: + async with dl as (bar, sink): + export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") + real_url = await self._resolve_link_target(export_url) - await impl() + content = link_template + content = content.replace("{{link}}", real_url) + content = content.replace("{{name}}", element.name) + content = content.replace("{{description}}", str(element.description)) + content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) + sink.file.write(content.encode("utf-8")) + sink.done() async def _resolve_link_target(self, export_url: str) -> str: async with self.session.get(export_url, allow_redirects=False) as resp: @@ -360,39 +387,43 @@ instance's greatest bottleneck. 
raise CrawlError("resolve_link_target failed even after authenticating") - async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: + async def _handle_video( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Awaitable[None]]: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) if not maybe_dl: - return - dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + return None - @_iorepeat(3, "downloading video") - async def impl() -> None: - assert dl # The function is only reached when dl is not None - async with dl as (bar, sink): - page = IliasPage(await self._get_page(element.url), element.url, element) - real_element = page.get_child_elements()[0] + return self._download_video(element, maybe_dl) - log.explain(f"Streaming video from real url {real_element.url}") + @_iorepeat(3, "downloading video") + async def _download_video(self, element: IliasPageElement, dl: DownloadToken) -> None: + async with dl as (bar, sink): + page = IliasPage(await self._get_page(element.url), element.url, element) + real_element = page.get_child_elements()[0] - await self._stream_from_url(real_element.url, sink, bar, is_video=True) + log.explain(f"Streaming video from real url {real_element.url}") - await impl() + await self._stream_from_url(real_element.url, sink, bar, is_video=True) - async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: + async def _handle_file( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Awaitable[None]]: maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: - return - dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + return None + return self._download_file(element, maybe_dl) - 
@_iorepeat(3, "downloading file") - async def impl() -> None: - assert dl # The function is only reached when dl is not None - async with dl as (bar, sink): - await self._stream_from_url(element.url, sink, bar, is_video=False) - - await impl() + @_iorepeat(3, "downloading file") + async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: + assert dl # The function is only reached when dl is not None + async with dl as (bar, sink): + await self._stream_from_url(element.url, sink, bar, is_video=False) async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: async def try_stream() -> bool: From 89be07d4d3562c75f10539c7a51c171933d3de82 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 3 Jul 2021 17:05:48 +0200 Subject: [PATCH 319/524] Use final crawl path in HTML parsing message --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index a61eb4e..83cac32 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -247,13 +247,12 @@ instance's greatest bottleneck. maybe_cl = await self.crawl(path) if not maybe_cl: return None - return self._crawl_ilias_page(url, parent, path, maybe_cl) + return self._crawl_ilias_page(url, parent, maybe_cl) async def _crawl_ilias_page( self, url: str, parent: IliasPageElement, - path: PurePath, cl: CrawlToken, ) -> None: elements: List[IliasPageElement] = [] @@ -267,7 +266,7 @@ instance's greatest bottleneck. 
while next_stage_url: soup = await self._get_page(next_stage_url) - log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") log.explain(f"URL: {next_stage_url}") page = IliasPage(soup, next_stage_url, current_parent) if next_element := page.get_next_stage_element(): From 8ec3f41251cf69a365c9009400e67d539bb4afc4 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 6 Jul 2021 16:13:23 +0200 Subject: [PATCH 320/524] Crawl ilias booking objects as links --- PFERD/crawl/ilias/kit_ilias_html.py | 4 ++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 58 +++++++++++++++++++--- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 41f45e2..247002b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -22,6 +22,7 @@ class IliasElementType(Enum): FOLDER = "folder" FORUM = "forum" LINK = "link" + BOOKING = "booking" MEETING = "meeting" VIDEO = "video" VIDEO_PLAYER = "video_player" @@ -490,6 +491,9 @@ class IliasPage: if str(img_tag["src"]).endswith("icon_webr.svg"): return IliasElementType.LINK + if str(img_tag["src"]).endswith("icon_book.svg"): + return IliasElementType.BOOKING + if str(img_tag["src"]).endswith("frm.svg"): return IliasElementType.FORUM diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 83cac32..a0e323b 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -323,6 +323,8 @@ instance's greatest bottleneck. 
return None elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) + elif element.type == IliasElementType.BOOKING: + return await self._handle_booking(element, element_path) elif element.type == IliasElementType.VIDEO: return await self._handle_file(element, element_path) elif element.type == IliasElementType.VIDEO_PLAYER: @@ -362,14 +364,56 @@ instance's greatest bottleneck. async with dl as (bar, sink): export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") real_url = await self._resolve_link_target(export_url) + self._write_link_content(link_template, real_url, element.name, element.description, sink) - content = link_template - content = content.replace("{{link}}", real_url) - content = content.replace("{{name}}", element.name) - content = content.replace("{{description}}", str(element.description)) - content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) - sink.file.write(content.encode("utf-8")) - sink.done() + def _write_link_content( + self, + link_template: str, + url: str, + name: str, + description: Optional[str], + sink: FileSink, + ) -> None: + content = link_template + content = content.replace("{{link}}", url) + content = content.replace("{{name}}", name) + content = content.replace("{{description}}", str(description)) + content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) + sink.file.write(content.encode("utf-8")) + sink.done() + + async def _handle_booking( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Awaitable[None]]: + log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}") + log.explain(f"Links type is {self._links}") + + link_template_maybe = self._links.template() + link_extension = self._links.extension() + if not link_template_maybe or not link_extension: + log.explain("Answer: No") + return None + else: + log.explain("Answer: Yes") + element_path = 
element_path.with_name(element_path.name + link_extension) + + maybe_dl = await self.download(element_path, mtime=element.mtime) + if not maybe_dl: + return None + + return self._download_booking(element, link_template_maybe, maybe_dl) + + @_iorepeat(3, "resolving booking") + async def _download_booking( + self, + element: IliasPageElement, + link_template: str, + dl: DownloadToken, + ) -> None: + async with dl as (bar, sink): + self._write_link_content(link_template, element.url, element.name, element.description, sink) async def _resolve_link_target(self, export_url: str) -> str: async with self.session.get(export_url, allow_redirects=False) as resp: From ee67f9f4725be9f418d66b85bb8a749de8e5d713 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 6 Jul 2021 17:45:12 +0200 Subject: [PATCH 321/524] Sort elements by ILIAS id to ensure deterministic ordering --- PFERD/crawl/ilias/kit_ilias_html.py | 11 +++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 247002b..7e91926 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -38,6 +38,17 @@ class IliasPageElement: mtime: Optional[datetime] = None description: Optional[str] = None + def id(self) -> str: + regexes = [r"eid=(?P[0-9a-z\-]+)", r"file_(?P\d+)", r"ref_id=(?P\d+)"] + + for regex in regexes: + if match := re.search(regex, self.url): + return match.groupdict()["id"] + + # Fall back to URL + log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.") + return self.url + class IliasPage: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index a0e323b..cca6987 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -230,6 +230,8 @@ instance's greatest bottleneck. 
# Fill up our task list with the found elements await gather_elements() + elements.sort(key=lambda e: e.id()) + tasks: List[Awaitable[None]] = [] for element in elements: if handle := await self._handle_ilias_element(PurePath("."), element): @@ -280,6 +282,8 @@ instance's greatest bottleneck. # Fill up our task list with the found elements await gather_elements() + elements.sort(key=lambda e: e.id()) + tasks: List[Awaitable[None]] = [] for element in elements: if handle := await self._handle_ilias_element(cl.path, element): From 86f79ff1f137f6f728df08a51b12acb096e00979 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 7 Jul 2021 14:26:20 +0200 Subject: [PATCH 322/524] Update changelog --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 427219e..20dd53c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,19 @@ ambiguous situations. ## Unreleased +### Added +- `--skip` command line option +- Support for ILIAS booking objects + +### Changed +- Using multiple path segments on left side of `-name->` now results in an + error. This was already forbidden by the documentation but silently accepted + by PFERD. +- More consistent path printing in some `--explain` messages + +### Fixed +- Nondeterministic name deduplication due to ILIAS reordering elements + ## 3.1.0 - 2021-06-13 If your config file doesn't do weird things with transforms, it should continue From 544d45cbc570080964ab50044301b304343f9a31 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 13 Jul 2021 15:42:11 +0200 Subject: [PATCH 323/524] Catch non-critical exceptions at crawler top level --- CHANGELOG.md | 1 + PFERD/crawl/crawler.py | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20dd53c..181ef99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ ambiguous situations. 
### Fixed - Nondeterministic name deduplication due to ILIAS reordering elements +- More exceptions are handled properly ## 3.1.0 - 2021-06-13 diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index d798bc3..c492ee9 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -320,6 +320,7 @@ class Crawler(ABC): log.explain("Warnings or errors occurred during this run") log.explain("Answer: No") + @anoncritical async def run(self) -> None: """ Start the crawling process. Call this function if you want to use a From 742632ed8d6cebd10c7e28902afba2fccb108712 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 4 Aug 2021 18:27:26 +0000 Subject: [PATCH 324/524] Bump version to 3.2.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 181ef99..1ac3a8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.2.0 - 2021-08-04 + ### Added - `--skip` command line option - Support for ILIAS booking objects diff --git a/PFERD/version.py b/PFERD/version.py index 8ce7ae4..b8efadd 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.1.0" +VERSION = "3.2.0" From 66730773977a2602aebd5396efc1c6d8bd7b0ad7 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 21 Oct 2021 12:01:41 +0200 Subject: [PATCH 325/524] Add kit-ipd crawler --- CHANGELOG.md | 1 + CONFIG.md | 7 ++ PFERD/cli/__init__.py | 1 + PFERD/cli/command_kit_ipd.py | 46 +++++++++++ PFERD/crawl/__init__.py | 3 + PFERD/crawl/kit_ipd_crawler.py | 138 +++++++++++++++++++++++++++++++++ 6 files changed, 196 insertions(+) create mode 100644 PFERD/cli/command_kit_ipd.py create mode 100644 PFERD/crawl/kit_ipd_crawler.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ac3a8d..cca4839 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ ambiguous situations. 
### Added - `--skip` command line option - Support for ILIAS booking objects +- A KIT IPD crawler ### Changed - Using multiple path segments on left side of `-name->` now results in an diff --git a/CONFIG.md b/CONFIG.md index 19afbd2..06b9246 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -136,6 +136,13 @@ crawler simulate a slower, network-based crawler. requests. (Default: `0.0`) - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) +### The `kit-ipd` crawler + +This crawler crals a KIT ipd page by url. The root page can be crawled from +outside the KIT network so you will be informed about any new/deleted files, +but downloading files requires you to be within. Adding a show delay between +requests is likely a good idea. + ### The `kit-ilias-web` crawler This crawler crawls the KIT ILIAS instance. diff --git a/PFERD/cli/__init__.py b/PFERD/cli/__init__.py index d70ecd9..efa8f00 100644 --- a/PFERD/cli/__init__.py +++ b/PFERD/cli/__init__.py @@ -9,4 +9,5 @@ from . import command_local # noqa: F401 imported but unused from . import command_kit_ilias_web # noqa: F401 imported but unused +from . 
import command_kit_ipd # noqa: F401 imported but unused from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py new file mode 100644 index 0000000..480cc9b --- /dev/null +++ b/PFERD/cli/command_kit_ipd.py @@ -0,0 +1,46 @@ +import argparse +import configparser +from pathlib import Path + +from ..logging import log +from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler + +SUBPARSER = SUBPARSERS.add_parser( + "kit-ipd", + parents=[CRAWLER_PARSER], +) + +GROUP = SUBPARSER.add_argument_group( + title="kit ipd crawler arguments", + description="arguments for the 'kit-ipd' crawler", +) +GROUP.add_argument( + "target", + type=str, + metavar="TARGET", + help="url to crawl" +) +GROUP.add_argument( + "output", + type=Path, + metavar="OUTPUT", + help="output directory" +) + + +def load( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + log.explain("Creating config for command 'kit-ipd'") + + parser["crawl:kit-ipd"] = {} + section = parser["crawl:ipd"] + load_crawler(args, section) + + section["type"] = "kit-ipd" + section["target"] = str(args.target) + section["output_dir"] = str(args.output) + + +SUBPARSER.set_defaults(command=load) diff --git a/PFERD/crawl/__init__.py b/PFERD/crawl/__init__.py index 7eb2fb1..1f8bd59 100644 --- a/PFERD/crawl/__init__.py +++ b/PFERD/crawl/__init__.py @@ -5,6 +5,7 @@ from ..auth import Authenticator from ..config import Config from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection +from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection from .local_crawler import LocalCrawler, LocalCrawlerSection CrawlerConstructor = Callable[[ @@ -19,4 +20,6 @@ CRAWLERS: Dict[str, CrawlerConstructor] = { LocalCrawler(n, LocalCrawlerSection(s), c), "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, 
KitIliasWebCrawlerSection(s), c, a), + "kit-ipd": lambda n, s, c, a: + KitIpdCrawler(n, KitIpdCrawlerSection(s), c), } diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py new file mode 100644 index 0000000..4d4addd --- /dev/null +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -0,0 +1,138 @@ +import os +from dataclasses import dataclass +from pathlib import PurePath +from typing import List, Set, Union +from urllib.parse import urljoin + +from bs4 import BeautifulSoup, Tag + +from ..config import Config +from ..logging import ProgressBar, log +from ..output_dir import FileSink +from ..utils import soupify +from .crawler import CrawlError +from .http_crawler import HttpCrawler, HttpCrawlerSection + + +class KitIpdCrawlerSection(HttpCrawlerSection): + def target(self) -> str: + target = self.s.get("target") + if not target: + self.missing_value("target") + + if not target.startswith("https://"): + self.invalid_value("target", target, "Should be a URL") + + return target + + +@dataclass +class KitIpdFile: + name: str + url: str + + +@dataclass +class KitIpdFolder: + name: str + files: List[KitIpdFile] + + +class KitIpdCrawler(HttpCrawler): + + def __init__( + self, + name: str, + section: KitIpdCrawlerSection, + config: Config, + ): + super().__init__(name, section, config) + self._url = section.target() + + async def _run(self) -> None: + maybe_cl = await self.crawl(PurePath(".")) + if not maybe_cl: + return + + folders: List[KitIpdFolder] = [] + + async with maybe_cl: + folder_tags = await self._fetch_folder_tags() + folders = [self._extract_folder(tag) for tag in folder_tags] + + tasks = [self._crawl_folder(folder) for folder in folders] + + await self.gather(tasks) + + async def _crawl_folder(self, folder: KitIpdFolder) -> None: + path = PurePath(folder.name) + if not await self.crawl(path): + return + + tasks = [self._download_file(path, file) for file in folder.files] + + await self.gather(tasks) + + async def _download_file(self, parent: 
PurePath, file: KitIpdFile) -> None: + element_path = parent / file.name + maybe_dl = await self.download(element_path) + if not maybe_dl: + return + + async with maybe_dl as (bar, sink): + await self._stream_from_url(file.url, sink, bar) + + async def _fetch_folder_tags(self) -> Set[Tag]: + page = await self.get_page() + elements: List[Tag] = self._find_file_links(page) + folder_tags: Set[Tag] = set() + + for element in elements: + enclosing_data: Tag = element.findParent(name="td") + label: Tag = enclosing_data.findPreviousSibling(name="td") + folder_tags.add(label) + + return folder_tags + + def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: + name = folder_tag.getText().strip() + files: List[KitIpdFile] = [] + + container: Tag = folder_tag.findNextSibling(name="td") + for link in self._find_file_links(container): + files.append(self._extract_file(link)) + + log.explain_topic(f"Found folder {name!r}") + for file in files: + log.explain(f"Found file {file.name!r}") + + return KitIpdFolder(name, files) + + def _extract_file(self, link: Tag) -> KitIpdFile: + name = link.getText().strip() + url = self._abs_url_from_link(link) + _, extension = os.path.splitext(url) + return KitIpdFile(name + extension, url) + + def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: + return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x}) + + def _abs_url_from_link(self, link_tag: Tag) -> str: + return urljoin(self._url, link_tag.get("href")) + + async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: + async with self.session.get(url, allow_redirects=False) as resp: + if resp.status == 403: + raise CrawlError("Received a 403. 
Are you within the KIT network/VPN?") + if resp.content_length: + bar.set_total(resp.content_length) + + async for data in resp.content.iter_chunked(1024): + sink.file.write(data) + bar.advance(len(data)) + + sink.done() + + async def get_page(self) -> BeautifulSoup: + async with self.session.get(self._url) as request: + return soupify(await request.read()) From fee12b3d9e8469d37b972f28d84a7d44538744bc Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 25 Oct 2021 17:44:12 +0000 Subject: [PATCH 326/524] Fix changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cca4839..522d96d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,12 +22,14 @@ ambiguous situations. ## Unreleased +### Added +- A KIT IPD crawler + ## 3.2.0 - 2021-08-04 ### Added - `--skip` command line option - Support for ILIAS booking objects -- A KIT IPD crawler ### Changed - Using multiple path segments on left side of `-name->` now results in an From 55ea304ff338f249914b95938675a4e9b07d0875 Mon Sep 17 00:00:00 2001 From: lukasprobst Date: Mon, 25 Oct 2021 22:32:54 +0200 Subject: [PATCH 327/524] Disable interpolation of ConfigParser --- CHANGELOG.md | 3 +++ CONFIG.md | 6 +++--- LICENSE | 2 +- PFERD/__main__.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 522d96d..a90c978 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ ambiguous situations. ### Added - A KIT IPD crawler +### Removed +- [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file + ## 3.2.0 - 2021-08-04 ### Added diff --git a/CONFIG.md b/CONFIG.md index 06b9246..4d2ec33 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -4,11 +4,11 @@ A config file consists of sections. A section begins with a `[section]` header, which is followed by a list of `key = value` pairs. Comments must be on their own line and start with `#`. 
Multiline values must be indented beyond their key. Boolean values can be `yes` or `no`. For more details and some examples on the -format, see the [configparser documentation][1] ([basic interpolation][2] is -enabled). +format, see the [configparser documentation][1] ([interpolation][2] is +disabled). [1]: "Supported INI File Structure" -[2]: "BasicInterpolation" +[2]: "Interpolation of values" ## The `DEFAULT` section diff --git a/LICENSE b/LICENSE index 01f15f5..c096c4a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim +Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim, thelukasprobst Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/__main__.py b/PFERD/__main__.py index b665feb..bdf5b34 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -15,7 +15,7 @@ from .transformer import RuleParseError def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser: log.explain_topic("Loading config") - parser = configparser.ConfigParser() + parser = configparser.ConfigParser(interpolation=None) if args.command is None: log.explain("No CLI command specified, loading config from file") From ef7d5ea2d3282e71cf0ba82698e409483cc1ad0a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 30 Oct 2021 18:09:05 +0200 Subject: [PATCH 328/524] Allow storing crawler-specific data in reports --- PFERD/report.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/PFERD/report.py b/PFERD/report.py index 919bb35..99a4661 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -1,6 +1,6 @@ import json from pathlib import Path, PurePath -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Optional, Set class ReportLoadError(Exception): @@ 
-67,6 +67,7 @@ class Report: self.deleted_files: Set[PurePath] = set() # Files that should have been deleted by the cleanup but weren't self.not_deleted_files: Set[PurePath] = set() + self.custom: Dict[str, Any] = dict() @staticmethod def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]: @@ -81,6 +82,15 @@ class Report: return result + @staticmethod + def _get_str_dictionary(data: Dict[str, Any], key: str) -> Dict[str, Any]: + result: Dict[str, Any] = data.get(key, {}) + + if not isinstance(result, dict): + raise ReportLoadError(f"Incorrect format: {key!r} is not a dictionary") + + return result + @classmethod def load(cls, path: Path) -> "Report": """ @@ -108,6 +118,7 @@ class Report: self.delete_file(PurePath(elem)) for elem in self._get_list_of_strs(data, "not_deleted"): self.not_delete_file(PurePath(elem)) + self.custom = self._get_str_dictionary(data, "custom") return self @@ -124,6 +135,7 @@ class Report: "changed": [str(path) for path in sorted(self.changed_files)], "deleted": [str(path) for path in sorted(self.deleted_files)], "not_deleted": [str(path) for path in sorted(self.not_deleted_files)], + "custom": self.custom } with open(path, "w") as f: @@ -190,3 +202,15 @@ class Report: """ self.not_deleted_files.add(path) + + def add_custom_value(self, key: str, value: Any) -> None: + """ + Adds a custom value under the passed key, overwriting any existing + """ + self.custom[key] = value + + def get_custom_value(self, key: str) -> Optional[Any]: + """ + Retrieves a custom value for the given key. 
+ """ + return self.custom.get(key) From f9a3f9b9f2702796f64d11d5d649261ea76a908d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 30 Oct 2021 18:12:29 +0200 Subject: [PATCH 329/524] Handle multi-stream videos --- PFERD/crawl/ilias/kit_ilias_html.py | 18 ++++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 86 +++++++++++++++++++--- 2 files changed, 92 insertions(+), 12 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 7e91926..78ae084 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -133,9 +133,21 @@ class IliasPage: # parse it json_object = json.loads(json_str) - # and fetch the video url! - video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] - return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + streams = [stream for stream in json_object["streams"] if stream["type"] == "video"] + + # and just fetch the lone video url! + if len(streams) == 1: + video_url = streams[0]["sources"]["mp4"][0]["src"] + return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + + log.explain(f"Found multiple videos for stream at {self._source_name}") + items = [] + for stream in sorted(streams, key=lambda stream: stream["content"]): + full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" + video_url = stream["sources"]["mp4"][0]["src"] + items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) + + return items def _find_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index cca6987..f483754 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,7 +1,7 @@ import asyncio import re from pathlib import PurePath -from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, 
TypeVar, Union +from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast import aiohttp from aiohttp import hdrs @@ -439,22 +439,90 @@ instance's greatest bottleneck. element: IliasPageElement, element_path: PurePath, ) -> Optional[Awaitable[None]]: - # Videos will NOT be redownloaded - their content doesn't really change and they are chunky - maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) - if not maybe_dl: + # Copy old mapping as it is likely still relevant + if self.prev_report: + self.report.add_custom_value( + str(element_path), + self.prev_report.get_custom_value(str(element_path)) + ) + + # A video might contain other videos, so let's "crawl" the video first + # to ensure rate limits apply. This must be a download as *this token* + # is re-used if the video consists of a single stream. In that case the + # file name is used and *not* the stream name the ilias html parser reported + # to ensure backwards compatibility. + maybe_dl = await self.download(element_path, redownload=Redownload.ALWAYS) + + # If we do not want to crawl it (user filter) or we have every file + # from the cached mapping already, we can ignore this and bail + if not maybe_dl or self._all_videos_locally_present(element_path): + # Mark all existing videos as known so they do not get deleted + # during cleanup. We "downloaded" them, just without actually making + # a network request as we assumed they did not change. 
+ for video in self._previous_contained_videos(element_path): + await self.download(video) + return None - return self._download_video(element, maybe_dl) + return self._download_video(element_path, element, maybe_dl) + + def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]: + if not self.prev_report: + return [] + custom_value = self.prev_report.get_custom_value(str(video_path)) + if not custom_value: + return [] + names = cast(List[str], custom_value) + folder = video_path.parent + return [PurePath(folder, name) for name in names] + + def _all_videos_locally_present(self, video_path: PurePath) -> bool: + if contained_videos := self._previous_contained_videos(video_path): + log.explain_topic(f"Checking local cache for video {video_path.name}") + all_found_locally = True + for video in contained_videos: + all_found_locally = all_found_locally and self._output_dir.resolve(video).exists() + if all_found_locally: + log.explain("Found all videos locally, skipping enumeration request") + return True + log.explain("Missing at least one video, continuing with requests!") + return False @_iorepeat(3, "downloading video") - async def _download_video(self, element: IliasPageElement, dl: DownloadToken) -> None: + async def _download_video( + self, + original_path: PurePath, + element: IliasPageElement, + dl: DownloadToken + ) -> None: + stream_elements: List[IliasPageElement] = [] async with dl as (bar, sink): page = IliasPage(await self._get_page(element.url), element.url, element) - real_element = page.get_child_elements()[0] + stream_elements = page.get_child_elements() - log.explain(f"Streaming video from real url {real_element.url}") + if len(stream_elements) > 1: + log.explain(f"Found multiple video streams for {element.name}") + else: + log.explain(f"Using single video mode for {element.name}") + stream_element = stream_elements[0] + await self._stream_from_url(stream_element.url, sink, bar, is_video=True) + 
self.report.add_custom_value(str(original_path), [original_path.name]) + return - await self._stream_from_url(real_element.url, sink, bar, is_video=True) + contained_video_paths: List[str] = [] + + for stream_element in stream_elements: + contained_video_paths.append(stream_element.name) + video_path = original_path.parent / stream_element.name + + maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) + if not maybe_dl: + continue + async with maybe_dl as (bar, sink): + log.explain(f"Streaming video from real url {stream_element.url}") + await self._stream_from_url(stream_element.url, sink, bar, is_video=True) + + self.report.add_custom_value(str(original_path), contained_video_paths) async def _handle_file( self, From e42ab83d32ce852eb26e1a21982399e2988e769a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 25 Oct 2021 11:07:25 +0200 Subject: [PATCH 330/524] Add support for ILIAS cards --- PFERD/crawl/ilias/kit_ilias_html.py | 94 ++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 78ae084..d8c347d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -368,6 +368,8 @@ class IliasPage: log.explain(f"Found {element_name!r}") result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) + result += self._find_cards() + return result def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: @@ -450,6 +452,90 @@ class IliasPage: log.explain(f"Found file {full_path!r}") return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) + def _find_cards(self) -> List[IliasPageElement]: + result: List[IliasPageElement] = [] + + card_titles: List[Tag] = self._soup.select(".card-title a") + + for title in card_titles: + url = self._abs_url_from_link(title) + name = _sanitize_path_name(title.getText().strip()) + type = 
self._find_type_from_card(title) + + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {title}") + continue + + result.append(IliasPageElement(type, url, name)) + + card_button_tiles: List[Tag] = self._soup.select(".card-title button") + + for button in card_button_tiles: + regex = re.compile(button["id"] + r".*window.open\(['\"](.+?)['\"]") + res = regex.search(str(self._soup)) + if not res: + _unexpected_html_warning() + log.warn_contd(f"Could not find click handler target for {button}") + continue + url = self._abs_url_from_relative(res.group(1)) + name = _sanitize_path_name(button.getText().strip()) + type = self._find_type_from_card(button) + caption_parent = button.findParent( + "div", + attrs={"class": lambda x: x and "caption" in x}, + ) + description = caption_parent.find_next_sibling("div").getText().strip() + + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {button}") + continue + + result.append(IliasPageElement(type, url, name, description=description)) + + return result + + def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]: + def is_card_root(element: Tag) -> bool: + return "il-card" in element["class"] and "thumbnail" in element["class"] + + card_root: Optional[Tag] = None + + # We look for the card root + for parent in card_title.parents: + if is_card_root(parent): + card_root = parent + break + + if card_root is None: + _unexpected_html_warning() + log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}") + return None + + icon: Tag = card_root.select_one(".il-card-repository-head .icon") + + if "opencast" in icon["class"]: + return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED + if "exc" in icon["class"]: + return IliasElementType.EXERCISE + if "webr" in icon["class"]: + return IliasElementType.LINK + if "book" in icon["class"]: + return IliasElementType.BOOKING + if "frm" in icon["class"]: + return 
IliasElementType.FORUM + if "sess" in icon["class"]: + return IliasElementType.MEETING + if "tst" in icon["class"]: + return IliasElementType.TEST + if "fold" in icon["class"]: + return IliasElementType.FOLDER + + _unexpected_html_warning() + log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") + return None + @staticmethod def _find_type_from_link( element_name: str, @@ -550,7 +636,13 @@ class IliasPage: """ Create an absolute url from an tag. """ - return urljoin(self._page_url, link_tag.get("href")) + return self._abs_url_from_relative(link_tag.get("href")) + + def _abs_url_from_relative(self, relative_url: str) -> str: + """ + Create an absolute url from a relative URL. + """ + return urljoin(self._page_url, relative_url) def _unexpected_html_warning() -> None: From ad3f4955f72a6bfbdcbaaae24b821f078e6e44d5 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 30 Oct 2021 18:14:39 +0200 Subject: [PATCH 331/524] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a90c978..faa2507 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ ambiguous situations. ### Added - A KIT IPD crawler +- Support for ILIAS cards +- Support for multi-stream videos ### Removed - [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file From d6f38a61e16fa95d8a2365abc1cfd70f35ee0289 Mon Sep 17 00:00:00 2001 From: Toorero <22551563+Toorero@users.noreply.github.com> Date: Mon, 25 Oct 2021 21:34:51 +0200 Subject: [PATCH 332/524] Fixed minor spelling mistakes --- CONFIG.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 4d2ec33..8ccaa50 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -36,7 +36,7 @@ Sections whose names start with `crawl:` are used to configure crawlers. The rest of the section name specifies the name of the crawler. 
A crawler synchronizes a remote resource to a local directory. There are -different types of crawlers for different kinds of resources, e. g. ILIAS +different types of crawlers for different kinds of resources, e.g. ILIAS courses or lecture websites. Each crawl section represents an instance of a specific type of crawler. The @@ -53,7 +53,7 @@ common to all crawlers: crawler can still be executed manually using the `--crawler` or `-C` flags. (Default: `no`) - `output_dir`: The directory the crawler synchronizes files to. A crawler will - never place any files outside of this directory. (Default: the crawler's name) + never place any files outside this directory. (Default: the crawler's name) - `redownload`: When to download a file that is already present locally. (Default: `never-smart`) - `never`: If a file is present locally, it is not downloaded again. @@ -138,7 +138,7 @@ crawler simulate a slower, network-based crawler. ### The `kit-ipd` crawler -This crawler crals a KIT ipd page by url. The root page can be crawled from +This crawler crawls a KIT ipd page by url. The root page can be crawled from outside the KIT network so you will be informed about any new/deleted files, but downloading files requires you to be within. Adding a show delay between requests is likely a good idea. @@ -312,11 +312,11 @@ matches `SOURCE`, the output path is created using `TARGET` as template. `SOURCE` is automatically anchored. `TARGET` uses Python's [format string syntax][3]. The *n*-th capturing group can -be referred to as `{g}` (e. g. `{g3}`). `{g0}` refers to the original path. +be referred to as `{g}` (e.g. `{g3}`). `{g0}` refers to the original path. If capturing group *n*'s contents are a valid integer, the integer value is -available as `{i}` (e. g. `{i3}`). If capturing group *n*'s contents are a -valid float, the float value is available as `{f}` (e. g. `{f3}`). If a -capturing group is not present (e. g. when matching the string `cd` with the +available as `{i}` (e.g. 
`{i3}`). If capturing group *n*'s contents are a +valid float, the float value is available as `{f}` (e.g. `{f3}`). If a +capturing group is not present (e.g. when matching the string `cd` with the regex `(ab)?cd`), the corresponding variables are not defined. Python's format string syntax has rich options for formatting its arguments. For From 6b2a65757373193a5ecb8d2263ae7d758178014d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julius=20R=C3=BCberg?= <22551563+Toorero@users.noreply.github.com> Date: Mon, 1 Nov 2021 10:09:50 +0100 Subject: [PATCH 333/524] Fix IPD crawler for different subpages (#42) This patch reworks the IPD crawler to support subpages which do not use "/intern" for links and fetches the folder names from table headings. --- PFERD/crawl/kit_ipd_crawler.py | 50 ++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 4d4addd..1ed5ffe 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -1,7 +1,9 @@ import os +import re from dataclasses import dataclass from pathlib import PurePath -from typing import List, Set, Union +from re import Pattern +from typing import List, Set, Union, AnyStr, Optional from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -25,6 +27,10 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target + def link_regex(self) -> Pattern[AnyStr]: + regex = self.s.get("link_regex", "^.*/[^/]*\.(?:pdf|zip|c|java)$") + return re.compile(regex) + @dataclass class KitIpdFile: @@ -48,6 +54,7 @@ class KitIpdCrawler(HttpCrawler): ): super().__init__(name, section, config) self._url = section.target() + self._file_regex = section.link_regex() async def _run(self) -> None: maybe_cl = await self.crawl(PurePath(".")) @@ -88,19 +95,28 @@ class KitIpdCrawler(HttpCrawler): folder_tags: Set[Tag] = set() for element in elements: - enclosing_data: Tag = element.findParent(name="td") - label: 
Tag = enclosing_data.findPreviousSibling(name="td") - folder_tags.add(label) + folder_label = self._fetch_folder_label(element) + if folder_label is None: + folder_tags.add(page) + else: + folder_tags.add(folder_label) return folder_tags def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: - name = folder_tag.getText().strip() files: List[KitIpdFile] = [] + # if files have found outside a regular table + if not folder_tag.name.startswith("h"): + name = "." + root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag)) + for link in root_links: + files.append(self._extract_file(link)) - container: Tag = folder_tag.findNextSibling(name="td") - for link in self._find_file_links(container): - files.append(self._extract_file(link)) + else: + name = folder_tag.getText().strip() + container: Tag = folder_tag.findNextSibling(name="table") + for link in self._find_file_links(container): + files.append(self._extract_file(link)) log.explain_topic(f"Found folder {name!r}") for file in files: @@ -108,14 +124,24 @@ class KitIpdCrawler(HttpCrawler): return KitIpdFolder(name, files) + @staticmethod + def _fetch_folder_label(file_link: Tag) -> Optional[Tag]: + enclosing_table: Tag = file_link.findParent(name="table") + if enclosing_table is None: + return None + label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) + if label is None: + return None + else: + return label + def _extract_file(self, link: Tag) -> KitIpdFile: - name = link.getText().strip() url = self._abs_url_from_link(link) - _, extension = os.path.splitext(url) - return KitIpdFile(name + extension, url) + name = os.path.basename(url) + return KitIpdFile(name, url) def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: - return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x}) + return tag.findAll(name="a", attrs={"href": self._file_regex}) def _abs_url_from_link(self, link_tag: Tag) -> str: return 
urljoin(self._url, link_tag.get("href")) From 88afe64a928fce7108264f386298edbbe60117f5 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 1 Nov 2021 10:43:13 +0100 Subject: [PATCH 334/524] Refactor IPD crawler a bit --- PFERD/cli/command_kit_ipd.py | 2 +- PFERD/crawl/kit_ipd_crawler.py | 75 +++++++++++++++++----------------- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py index 480cc9b..c4c593f 100644 --- a/PFERD/cli/command_kit_ipd.py +++ b/PFERD/cli/command_kit_ipd.py @@ -35,7 +35,7 @@ def load( log.explain("Creating config for command 'kit-ipd'") parser["crawl:kit-ipd"] = {} - section = parser["crawl:ipd"] + section = parser["crawl:kit-ipd"] load_crawler(args, section) section["type"] = "kit-ipd" diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 1ed5ffe..76145b4 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from pathlib import PurePath from re import Pattern -from typing import List, Set, Union, AnyStr, Optional +from typing import Awaitable, List, Optional, Set, Union from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -27,12 +27,12 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target - def link_regex(self) -> Pattern[AnyStr]: - regex = self.s.get("link_regex", "^.*/[^/]*\.(?:pdf|zip|c|java)$") + def link_regex(self) -> Pattern[str]: + regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|java)$") return re.compile(regex) -@dataclass +@dataclass(unsafe_hash=True) class KitIpdFile: name: str url: str @@ -43,6 +43,14 @@ class KitIpdFolder: name: str files: List[KitIpdFile] + def explain(self) -> None: + log.explain_topic(f"Folder {self.name!r}") + for file in self.files: + log.explain(f"File {file.name!r}") + + def __hash__(self) -> int: + return self.name.__hash__() + class KitIpdCrawler(HttpCrawler): @@ -61,13 +69,15 @@ 
class KitIpdCrawler(HttpCrawler): if not maybe_cl: return - folders: List[KitIpdFolder] = [] + tasks: List[Awaitable[None]] = [] async with maybe_cl: - folder_tags = await self._fetch_folder_tags() - folders = [self._extract_folder(tag) for tag in folder_tags] - - tasks = [self._crawl_folder(folder) for folder in folders] + for item in await self._fetch_items(): + if isinstance(item, KitIpdFolder): + tasks.append(self._crawl_folder(item)) + else: + # Orphan files are placed in the root folder + tasks.append(self._download_file(PurePath("."), item)) await self.gather(tasks) @@ -89,51 +99,42 @@ class KitIpdCrawler(HttpCrawler): async with maybe_dl as (bar, sink): await self._stream_from_url(file.url, sink, bar) - async def _fetch_folder_tags(self) -> Set[Tag]: + async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: page = await self.get_page() elements: List[Tag] = self._find_file_links(page) - folder_tags: Set[Tag] = set() + items: Set[Union[KitIpdFile, KitIpdFolder]] = set() for element in elements: - folder_label = self._fetch_folder_label(element) - if folder_label is None: - folder_tags.add(page) + folder_label = self._find_folder_label(element) + if folder_label: + folder = self._extract_folder(folder_label) + if folder not in items: + items.add(folder) + folder.explain() else: - folder_tags.add(folder_label) + file = self._extract_file(element) + items.add(file) + log.explain_topic(f"Orphan file {file.name!r}") + log.explain("Attributing it to root folder") - return folder_tags + return items def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: files: List[KitIpdFile] = [] - # if files have found outside a regular table - if not folder_tag.name.startswith("h"): - name = "." 
- root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag)) - for link in root_links: - files.append(self._extract_file(link)) + name = folder_tag.getText().strip() - else: - name = folder_tag.getText().strip() - container: Tag = folder_tag.findNextSibling(name="table") - for link in self._find_file_links(container): - files.append(self._extract_file(link)) - - log.explain_topic(f"Found folder {name!r}") - for file in files: - log.explain(f"Found file {file.name!r}") + container: Tag = folder_tag.findNextSibling(name="table") + for link in self._find_file_links(container): + files.append(self._extract_file(link)) return KitIpdFolder(name, files) @staticmethod - def _fetch_folder_label(file_link: Tag) -> Optional[Tag]: + def _find_folder_label(file_link: Tag) -> Optional[Tag]: enclosing_table: Tag = file_link.findParent(name="table") if enclosing_table is None: return None - label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) - if label is None: - return None - else: - return label + return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) def _extract_file(self, link: Tag) -> KitIpdFile: url = self._abs_url_from_link(link) From 13b8c3d9c6c59ab2714e2670506d89c5a2cb6eb6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 2 Nov 2021 09:30:46 +0100 Subject: [PATCH 335/524] Add regex option to config and CLI parser --- CONFIG.md | 7 ++++++- LICENSE | 3 ++- PFERD/cli/command_kit_ipd.py | 8 ++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 8ccaa50..569780d 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -138,11 +138,16 @@ crawler simulate a slower, network-based crawler. ### The `kit-ipd` crawler -This crawler crawls a KIT ipd page by url. The root page can be crawled from +This crawler crawls a KIT-IPD page by url. 
The root page can be crawled from outside the KIT network so you will be informed about any new/deleted files, but downloading files requires you to be within. Adding a show delay between requests is likely a good idea. +- `target`: URL to a KIT-IPD page +- `link_regex`: A regex that is matched against the `href` part of links. If it + matches, the given link is downloaded as a file. This is used to extract + files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|java)$`) + ### The `kit-ilias-web` crawler This crawler crawls the KIT ILIAS instance. diff --git a/LICENSE b/LICENSE index c096c4a..fe2293f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,5 @@ -Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim, thelukasprobst +Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, + TheChristophe, Scriptim, thelukasprobst, Toorero Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py index c4c593f..b53e67e 100644 --- a/PFERD/cli/command_kit_ipd.py +++ b/PFERD/cli/command_kit_ipd.py @@ -14,6 +14,12 @@ GROUP = SUBPARSER.add_argument_group( title="kit ipd crawler arguments", description="arguments for the 'kit-ipd' crawler", ) +GROUP.add_argument( + "--link-regex", + type=str, + metavar="REGEX", + help="href-matching regex to identify downloadable files" +) GROUP.add_argument( "target", type=str, @@ -41,6 +47,8 @@ def load( section["type"] = "kit-ipd" section["target"] = str(args.target) section["output_dir"] = str(args.output) + if args.link_regex: + section["link_regex"] = str(args.link_regex) SUBPARSER.set_defaults(command=load) From 6289938d7c772660a5d497ce456168186eb8a6fb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 6 Nov 2021 12:09:51 +0100 Subject: [PATCH 336/524] Do not stop crawling files when encountering a 
CrawlWarning --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index f483754..c3e51ef 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -84,7 +84,7 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([ AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) -def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: +def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: def decorator(f: AWrapped) -> AWrapped: async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: last_exception: Optional[BaseException] = None @@ -105,7 +105,10 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: if last_exception: message = f"Error in I/O Operation: {last_exception}" - raise CrawlWarning(message) from last_exception + if failure_is_error: + raise CrawlError(message) from last_exception + else: + raise CrawlWarning(message) from last_exception raise CrawlError("Impossible return in ilias _iorepeat") return wrapper # type: ignore @@ -251,6 +254,7 @@ instance's greatest bottleneck. return None return self._crawl_ilias_page(url, parent, maybe_cl) + @anoncritical async def _crawl_ilias_page( self, url: str, @@ -292,10 +296,12 @@ instance's greatest bottleneck. # And execute them await self.gather(tasks) + # These decorators only apply *to this method* and *NOT* to the returned + # awaitables! + # This method does not await the handlers but returns them instead. + # This ensures one level is handled at a time and name deduplication + # works correctly. @anoncritical - # Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical. 
- # If that happens we will be terminated as anoncritical doesn't tream them as non-critical. - @_wrap_io_in_warning("handling ilias element") async def _handle_ilias_element( self, parent_path: PurePath, @@ -363,6 +369,7 @@ instance's greatest bottleneck. return self._download_link(element, link_template_maybe, maybe_dl) + @anoncritical @_iorepeat(3, "resolving link") async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: async with dl as (bar, sink): @@ -409,6 +416,7 @@ instance's greatest bottleneck. return self._download_booking(element, link_template_maybe, maybe_dl) + @anoncritical @_iorepeat(3, "resolving booking") async def _download_booking( self, @@ -488,6 +496,7 @@ instance's greatest bottleneck. log.explain("Missing at least one video, continuing with requests!") return False + @anoncritical @_iorepeat(3, "downloading video") async def _download_video( self, @@ -534,6 +543,7 @@ instance's greatest bottleneck. return None return self._download_file(element, maybe_dl) + @anoncritical @_iorepeat(3, "downloading file") async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: assert dl # The function is only reached when dl is not None @@ -589,7 +599,7 @@ instance's greatest bottleneck. # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. 
- @_iorepeat(3, "Login") + @_iorepeat(3, "Login", failure_is_error=True) async def _authenticate(self) -> None: await self._shibboleth_login.login(self.session) From 90cb6e989b492bbfe2f242c77aad616b86637052 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 6 Nov 2021 23:20:24 +0100 Subject: [PATCH 337/524] Do not download single videos if cache does not exist --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c3e51ef..c6115f4 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -514,7 +514,12 @@ instance's greatest bottleneck. else: log.explain(f"Using single video mode for {element.name}") stream_element = stream_elements[0] - await self._stream_from_url(stream_element.url, sink, bar, is_video=True) + + # We do not have a local cache yet + if self._output_dir.resolve(original_path).exists(): + log.explain(f"Video for {element.name} existed locally") + else: + await self._stream_from_url(stream_element.url, sink, bar, is_video=True) self.report.add_custom_value(str(original_path), [original_path.name]) return From a82a0b19c2193c6817ae07361889de8fd392868e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 7 Nov 2021 21:40:22 +0100 Subject: [PATCH 338/524] Collect crawler warnings/errors and include them in the report --- PFERD/crawl/crawler.py | 8 ++++++-- PFERD/pferd.py | 8 ++++++++ PFERD/report.py | 24 +++++++++++++++++++++++- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index c492ee9..53f43e9 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -47,10 +47,12 @@ def noncritical(f: Wrapped) -> Wrapped: try: f(*args, **kwargs) except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: + crawler.report.add_warning(str(e)) log.warn(str(e)) 
crawler.error_free = False - except: # noqa: E722 do not use bare 'except' + except Exception as e: crawler.error_free = False + crawler.report.add_error(str(e)) raise return wrapper # type: ignore @@ -83,8 +85,10 @@ def anoncritical(f: AWrapped) -> AWrapped: except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: log.warn(str(e)) crawler.error_free = False - except: # noqa: E722 do not use bare 'except' + crawler.report.add_warning(str(e)) + except Exception as e: crawler.error_free = False + crawler.report.add_error(str(e)) raise return None diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 726ed45..079053b 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -182,5 +182,13 @@ class Pferd: something_changed = True log.report(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") + for warning in crawler.report.encountered_warnings: + something_changed = True + log.report(f" [bold bright_red]Warning[/] {warning}") + + for error in crawler.report.encountered_errors: + something_changed = True + log.report(f" [bold bright_red]Error[/] {error}") + if not something_changed: log.report(" Nothing changed") diff --git a/PFERD/report.py b/PFERD/report.py index 99a4661..0e0c789 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -67,8 +67,14 @@ class Report: self.deleted_files: Set[PurePath] = set() # Files that should have been deleted by the cleanup but weren't self.not_deleted_files: Set[PurePath] = set() + + # Custom crawler-specific data self.custom: Dict[str, Any] = dict() + # Encountered errors and warnings + self.encountered_warnings: List[str] = [] + self.encountered_errors: List[str] = [] + @staticmethod def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]: result: Any = data.get(key, []) @@ -119,6 +125,8 @@ class Report: for elem in self._get_list_of_strs(data, "not_deleted"): self.not_delete_file(PurePath(elem)) self.custom = self._get_str_dictionary(data, "custom") + self.encountered_errors = 
self._get_list_of_strs(data, "encountered_errors") + self.encountered_warnings = self._get_list_of_strs(data, "encountered_warnings") return self @@ -135,7 +143,9 @@ class Report: "changed": [str(path) for path in sorted(self.changed_files)], "deleted": [str(path) for path in sorted(self.deleted_files)], "not_deleted": [str(path) for path in sorted(self.not_deleted_files)], - "custom": self.custom + "custom": self.custom, + "encountered_warnings": self.encountered_warnings, + "encountered_errors": self.encountered_errors, } with open(path, "w") as f: @@ -214,3 +224,15 @@ class Report: Retrieves a custom value for the given key. """ return self.custom.get(key) + + def add_error(self, error: str) -> None: + """ + Adds an error to this report's error list. + """ + self.encountered_errors.append(error) + + def add_warning(self, warning: str) -> None: + """ + Adds a warning to this report's warning list. + """ + self.encountered_warnings.append(warning) From eac2e341612461987d37314110c3f4c7640499f3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 7 Jan 2022 23:32:31 +0100 Subject: [PATCH 339/524] Fix is_logged_in for ILIAS 7 --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c6115f4..c5b2953 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -611,9 +611,10 @@ instance's greatest bottleneck. @staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: # Normal ILIAS pages - userlog = soup.find("li", {"id": "userlog"}) - if userlog is not None: - return True + mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") + if mainbar is not None: + login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) + return not login_button # Video listing embeds do not have complete ILIAS html. 
Try to match them by # their video listing table video_table = soup.find( From a99356f2a2d403ffb40f47bb159707d73e55a0e3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 00:27:34 +0100 Subject: [PATCH 340/524] Fix video stream extraction --- PFERD/crawl/ilias/kit_ilias_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d8c347d..ece88c5 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -133,7 +133,7 @@ class IliasPage: # parse it json_object = json.loads(json_str) - streams = [stream for stream in json_object["streams"] if stream["type"] == "video"] + streams = [stream for stream in json_object["streams"]] # and just fetch the lone video url! if len(streams) == 1: From 462d993fbc00602b4952d675fa4c77e5372c27fa Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 00:27:48 +0100 Subject: [PATCH 341/524] Fix local video path cache (hopefully) --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c5b2953..5d44566 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -489,7 +489,10 @@ instance's greatest bottleneck. 
log.explain_topic(f"Checking local cache for video {video_path.name}") all_found_locally = True for video in contained_videos: - all_found_locally = all_found_locally and self._output_dir.resolve(video).exists() + transformed_path = self._transformer.transform(video) + if transformed_path: + exists_locally = self._output_dir.resolve(transformed_path).exists() + all_found_locally = all_found_locally and exists_locally if all_found_locally: log.explain("Found all videos locally, skipping enumeration request") return True @@ -515,8 +518,12 @@ instance's greatest bottleneck. log.explain(f"Using single video mode for {element.name}") stream_element = stream_elements[0] + transformed_path = self._transformer.transform(original_path) + if not transformed_path: + raise CrawlError(f"Download returned a path but transform did not for {original_path}") + # We do not have a local cache yet - if self._output_dir.resolve(original_path).exists(): + if self._output_dir.resolve(transformed_path).exists(): log.explain(f"Video for {element.name} existed locally") else: await self._stream_from_url(stream_element.url, sink, bar, is_video=True) @@ -526,8 +533,8 @@ instance's greatest bottleneck. 
contained_video_paths: List[str] = [] for stream_element in stream_elements: - contained_video_paths.append(stream_element.name) video_path = original_path.parent / stream_element.name + contained_video_paths.append(str(video_path)) maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) if not maybe_dl: From 6f3cfd43969cdac557c4f2d38bd2b4f0ffd40721 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 16:58:15 +0100 Subject: [PATCH 342/524] Fix personal desktop crawling --- PFERD/crawl/ilias/kit_ilias_html.py | 61 ++++++++++++++++++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 +++- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index ece88c5..9c8ab95 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -39,7 +39,12 @@ class IliasPageElement: description: Optional[str] = None def id(self) -> str: - regexes = [r"eid=(?P[0-9a-z\-]+)", r"file_(?P\d+)", r"ref_id=(?P\d+)"] + regexes = [ + r"eid=(?P[0-9a-z\-]+)", + r"file_(?P\d+)", + r"ref_id=(?P\d+)", + r"target=[a-z]+_(?P\d+)" + ] for regex in regexes: if match := re.search(regex, self.url): @@ -71,6 +76,9 @@ class IliasPage: if self._is_exercise_file(): log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() + if self._is_personal_desktop(): + log.explain("Page is the personal desktop") + return self._find_personal_desktop_entries() log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() @@ -115,6 +123,9 @@ class IliasPage: return False + def _is_personal_desktop(self) -> bool: + return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. 
The actual video stream url is nowhere @@ -149,6 +160,26 @@ class IliasPage: return items + def _find_personal_desktop_entries(self) -> List[IliasPageElement]: + items: List[IliasPageElement] = [] + + titles: List[Tag] = self._soup.select(".il-item-title") + for title in titles: + link = title.find("a") + name = _sanitize_path_name(link.text.strip()) + url = self._abs_url_from_link(link) + + type = self._find_type_from_link(name, link, url) + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {link}") + continue + + log.explain(f"Found {name!r}") + items.append(IliasPageElement(type, url, name)) + + return items + def _find_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. The initial dummy page without any videos. This page contains the link to the listing @@ -551,9 +582,30 @@ class IliasPage: if "target=file_" in parsed_url.query: return IliasElementType.FILE + if "target=grp_" in parsed_url.query: + return IliasElementType.FOLDER + + if "target=crs_" in parsed_url.query: + return IliasElementType.FOLDER + + if "baseClass=ilExerciseHandlerGUI" in parsed_url.query: + return IliasElementType.EXERCISE + + if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query: + return IliasElementType.LINK + + if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query: + return IliasElementType.FORUM + + if "cmdClass=ilobjtestgui" in parsed_url.query: + return IliasElementType.TEST + + # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so + # try to guess it from the image. 
+ # Everything with a ref_id can *probably* be opened to reveal nested things # video groups, directories, exercises, etc - if "ref_id=" in parsed_url.query: + if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path: return IliasPage._find_type_from_folder_like(link_element, url) _unexpected_html_warning() @@ -574,7 +626,7 @@ class IliasPage: # We look for the outer div of our inner link, to find information around it # (mostly the icon) for parent in link_element.parents: - if "ilContainerListItemOuter" in parent["class"]: + if "ilContainerListItemOuter" in parent["class"] or "il-std-item" in parent["class"]: found_parent = parent break @@ -586,6 +638,9 @@ class IliasPage: # Find the small descriptive icon to figure out the type img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") + if img_tag is None: + img_tag = found_parent.select_one("img.icon") + if img_tag is None: _unexpected_html_warning() log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 5d44566..99d6cf6 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -203,7 +203,9 @@ instance's greatest bottleneck. await self._crawl_url(root_url, expected_id=course_id) async def _crawl_desktop(self) -> None: - await self._crawl_url(self._base_url) + appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items" + appendix = appendix.encode("ASCII").hex() + await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix) async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: maybe_cl = await self.crawl(PurePath(".")) @@ -622,6 +624,11 @@ instance's greatest bottleneck. 
if mainbar is not None: login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) return not login_button + + # Personal Desktop + if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): + return True + # Video listing embeds do not have complete ILIAS html. Try to match them by # their video listing table video_table = soup.find( From ced8b9a2d032e7e4956b331d4408cb4b0829c780 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 16:58:30 +0100 Subject: [PATCH 343/524] Fix some accordions --- PFERD/crawl/ilias/kit_ilias_html.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 9c8ab95..0a81222 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -428,7 +428,10 @@ class IliasPage: continue prev: Tag = parent.findPreviousSibling("div") if "ilContainerBlockHeader" in prev.get("class"): - found_titles.append(prev.find("h3").getText().strip()) + if prev.find("h3"): + found_titles.append(prev.find("h3").getText().strip()) + else: + found_titles.append(prev.find("h2").getText().strip()) # And this for real accordions if "il_VAccordionContentDef" in parent.get("class"): From 5f527bc697b58512520f4d8ff93b856ff3a345b1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 17:14:40 +0100 Subject: [PATCH 344/524] Remove Python 3.9 Pattern typehints --- PFERD/crawl/ilias/kit_ilias_html.py | 2 +- PFERD/crawl/kit_ipd_crawler.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 0a81222..78bedbf 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -132,7 +132,7 @@ class IliasPage: # on the page, but defined in a JS object inside a script tag, passed to the player # library. 
# We do the impossible and RegEx the stream JSON object out of the page's HTML source - regex: re.Pattern[str] = re.compile( + regex = re.compile( r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE ) json_match = regex.search(str(self._soup)) diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 76145b4..1a5314b 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -2,8 +2,7 @@ import os import re from dataclasses import dataclass from pathlib import PurePath -from re import Pattern -from typing import Awaitable, List, Optional, Set, Union +from typing import Awaitable, List, Optional, Pattern, Set, Union from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag From e32c1f000fb9abcc47f8dc127b4d674acfa1662c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 18:05:48 +0100 Subject: [PATCH 345/524] Fix mtime for single streams --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 99d6cf6..c4e70c0 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -461,7 +461,7 @@ instance's greatest bottleneck. # is re-used if the video consists of a single stream. In that case the # file name is used and *not* the stream name the ilias html parser reported # to ensure backwards compatibility. 
- maybe_dl = await self.download(element_path, redownload=Redownload.ALWAYS) + maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS) # If we do not want to crawl it (user filter) or we have every file # from the cached mapping already, we can ignore this and bail From eb4de8ae0cc37e38e9fa801f729e68d1f71a0bb0 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 18:14:43 +0100 Subject: [PATCH 346/524] Ignore 1970 dates as windows crashes when calling .timestamp() --- PFERD/output_dir.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 0fb9911..e612267 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -231,7 +231,8 @@ class OutputDirectory: stat = local_path.stat() remote_newer = None - if mtime := heuristics.mtime: + if heuristics.mtime and heuristics.mtime.year > 1970: + mtime = heuristics.mtime remote_newer = mtime.timestamp() > stat.st_mtime if remote_newer: log.explain("Remote file seems to be newer") From 43c5453e100aedede844a242721d2990845c2c26 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 19:59:42 +0100 Subject: [PATCH 347/524] Correctly crawl files on desktop The files on the desktop do not include a download link, so we need to rewrite it. 
--- PFERD/crawl/ilias/kit_ilias_html.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 78bedbf..cee0555 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -176,6 +176,11 @@ class IliasPage: continue log.explain(f"Found {name!r}") + + if type == IliasElementType.FILE and "_download" not in url: + url = re.sub(r"(target=file_\d+)", r"\1_download", url) + log.explain("Rewired file URL to include download part") + items.append(IliasPageElement(type, url, name)) return items From 10d9d7452809aafe4f406f894944a078072f16bf Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 20:28:30 +0100 Subject: [PATCH 348/524] Bail out when crawling recursive courses --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c4e70c0..8f78e7a 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -182,6 +182,7 @@ instance's greatest bottleneck. self._link_file_redirect_delay = section.link_redirect_delay() self._links = section.links() self._videos = section.videos() + self._visited_urls: Set[str] = set() async def _run(self) -> None: if isinstance(self._target, int): @@ -309,6 +310,12 @@ instance's greatest bottleneck. parent_path: PurePath, element: IliasPageElement, ) -> Optional[Awaitable[None]]: + if element.url in self._visited_urls: + raise CrawlWarning( + f"Found second path to element {element.name!r} at {element.url!r}. 
Aborting subpath" + ) + self._visited_urls.add(element.url) + element_path = PurePath(parent_path, element.name) if element.type in _VIDEO_ELEMENTS: From d30f25ee9788d3363544ba9779cabf157dba3b98 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 20:28:45 +0100 Subject: [PATCH 349/524] Detect shib login page as login page And do not assume we are logged in... --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 8f78e7a..c3b2342 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -630,7 +630,8 @@ instance's greatest bottleneck. mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") if mainbar is not None: login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) - return not login_button + shib_login = soup.find(id="button_shib_login") + return not login_button and not shib_login # Personal Desktop if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): From 4ee919625da8d3d04cbb889e24d05b1c09436fe8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 20:47:35 +0100 Subject: [PATCH 350/524] Add rudimentary support for content pages --- PFERD/crawl/ilias/kit_ilias_html.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index cee0555..754af16 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -77,8 +77,11 @@ class IliasPage: log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() if self._is_personal_desktop(): - log.explain("Page is the personal desktop") + log.explain("Page is the personal desktop, searching for elements") return 
self._find_personal_desktop_entries() + if self._is_content_page(): + log.explain("Page is a content page, searching for elements") + return self._find_copa_entries() log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() @@ -126,6 +129,12 @@ class IliasPage: def _is_personal_desktop(self) -> bool: return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) + def _is_content_page(self) -> bool: + link = self._soup.find(id="current_perma_link") + if not link: + return False + return "target=copa_" in link.get("value") + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. The actual video stream url is nowhere @@ -185,6 +194,23 @@ class IliasPage: return items + def _find_copa_entries(self) -> List[IliasPageElement]: + items: List[IliasPageElement] = [] + links: List[Tag] = self._soup.findAll(class_="ilc_flist_a_FileListItemLink") + + for link in links: + url = self._abs_url_from_link(link) + name = _sanitize_path_name(link.getText().strip().replace("\t", "")) + + if "file_id" not in url: + _unexpected_html_warning() + log.warn_contd(f"Found unknown content page item {name!r} with url {url!r}") + continue + + items.append(IliasPageElement(IliasElementType.FILE, url, name)) + + return items + def _find_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. The initial dummy page without any videos. 
This page contains the link to the listing From 4bf0c972e6e37afc7f9688104082189f5f78d390 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 11:47:59 +0100 Subject: [PATCH 351/524] Update types for rich 11 --- PFERD/logging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/logging.py b/PFERD/logging.py index 32e5268..e2d64fc 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -5,7 +5,7 @@ from contextlib import asynccontextmanager, contextmanager # TODO In Python 3.9 and above, ContextManager is deprecated from typing import AsyncIterator, ContextManager, Iterator, List, Optional -from rich.console import Console, RenderGroup +from rich.console import Console, Group from rich.live import Live from rich.markup import escape from rich.panel import Panel @@ -68,7 +68,7 @@ class Log: if self._download_progress.task_ids: elements.append(self._download_progress) - group = RenderGroup(*elements) # type: ignore + group = Group(*elements) # type: ignore self._live.update(group) @contextmanager From e9d2d0503001728f6c1f313982d8843d83405e3d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 11:39:42 +0100 Subject: [PATCH 352/524] Update changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index faa2507..1b392c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,10 +26,16 @@ ambiguous situations. 
- A KIT IPD crawler - Support for ILIAS cards - Support for multi-stream videos +- Support for ILIAS 7 ### Removed - [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file +### Fixed +- Crawling of recursive courses +- Crawling files directly placed on the personal desktop +- Ignore timestamps at the unix epoch as they crash on windows + ## 3.2.0 - 2021-08-04 ### Added From e467b38d739347d62cbb122d9f4752abe823b423 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 18:23:00 +0100 Subject: [PATCH 353/524] Only reject 1970 timestamps on windows --- PFERD/output_dir.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index e612267..441717b 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -231,7 +231,9 @@ class OutputDirectory: stat = local_path.stat() remote_newer = None - if heuristics.mtime and heuristics.mtime.year > 1970: + + # Python on Windows crashes when faced with timestamps around the unix epoch + if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): mtime = heuristics.mtime remote_newer = mtime.timestamp() > stat.st_mtime if remote_newer: From 33453ede2d63b15bcca2ce541af2299440bfa8ff Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 18:31:42 +0100 Subject: [PATCH 354/524] Update dependency versions in setup.py --- setup.cfg | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.cfg b/setup.cfg index 5758282..059798a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,11 +6,11 @@ version = attr: PFERD.version.VERSION packages = find: python_requires = >=3.8 install_requires = - aiohttp>=3.7.4.post0 - beautifulsoup4>=4.9.3 - rich>=10.1.0 - keyring>=23.0.1 - certifi>=2020.12.5 + aiohttp>=3.8.1 + beautifulsoup4>=4.10.0 + rich>=11.0.0 + keyring>=23.5.0 + certifi>=2021.10.8 [options.entry_points] console_scripts = From 9618aae83bf10b8e517c53a53c47d14dd707c707 Mon 
Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 18:32:58 +0100 Subject: [PATCH 355/524] Add content pages to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b392c1..6e4c7e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Added - A KIT IPD crawler - Support for ILIAS cards +- (Rudimentary) support for content pages - Support for multi-stream videos - Support for ILIAS 7 From 0045124a4e2851d4d1d84bc7c2b68c75f49d5375 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 9 Jan 2022 21:09:09 +0100 Subject: [PATCH 356/524] Bump version to 3.3.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e4c7e9..132351b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.3.0 - 2022-01-09 + ### Added - A KIT IPD crawler - Support for ILIAS cards diff --git a/PFERD/version.py b/PFERD/version.py index b8efadd..ca58f3a 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.2.0" +VERSION = "3.3.0" From 57ec51e95a238960d1832ba0ad85b2ff6ec1de3b Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 14 Jan 2022 20:15:19 +0100 Subject: [PATCH 357/524] Fix login after shib url parser change --- CHANGELOG.md | 4 +++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 39 +++++++++++++++++++--- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 132351b..41ee3d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,10 @@ ambiguous situations. ## Unreleased +### Fixed +- Shibboleth login fixed. It was broken due to URL parser changes and really + *unfortunate* behaviour by aiohttp. 
+ ## 3.3.0 - 2022-01-09 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c3b2342..c26ce8b 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -4,6 +4,7 @@ from pathlib import PurePath from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast import aiohttp +import yarl from aiohttp import hdrs from bs4 import BeautifulSoup, Tag @@ -674,14 +675,14 @@ class KitShibbolethLogin: # Equivalent: Click on "Mit KIT-Account anmelden" button in # https://ilias.studium.kit.edu/login.php - url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login" + url = "https://ilias.studium.kit.edu/shib_login.php" data = { "sendLogin": "1", "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", - "target": "/shib_login.php", - "home_organization_selection": "Mit KIT-Account anmelden", + "il_target": "", + "home_organization_selection": "Weiter", } - soup: BeautifulSoup = await _post(sess, url, data) + soup: BeautifulSoup = await _shib_post(sess, url, data) # Attempt to login using credentials, if necessary while not self._login_successful(soup): @@ -761,3 +762,33 @@ class KitShibbolethLogin: async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: async with session.post(url, data=data) as response: return soupify(await response.read()) + + +async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: + """ + aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected + by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and + build encoded URL objects ourselfs... Who thought mangling location header was a good idea?? 
+ """ + async with session.post(url, data=data, allow_redirects=False) as response: + location = response.headers.get("location") + if not location: + raise CrawlWarning(f"Login failed, no location header present at {url}") + correct_url = yarl.URL(location, encoded=True) + + async with session.get(correct_url, allow_redirects=False) as response: + as_yarl = yarl.URL(response.url) + location = response.headers.get("location") + + if not location or not as_yarl.host: + raise CrawlWarning(f"Login failed, no location header present at {correct_url}") + + correct_url = yarl.URL.build( + scheme=as_yarl.scheme, + host=as_yarl.host, + path=location, + encoded=True + ) + + async with session.get(correct_url, allow_redirects=False) as response: + return soupify(await response.read()) From f47e7374d23b71396b511ee7b57f59d46c34e00d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 14 Jan 2022 22:01:45 +0100 Subject: [PATCH 358/524] Use fixed windows path for video cache --- CHANGELOG.md | 4 +++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 +++++++-- PFERD/deduplicator.py | 6 ++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 41ee3d5..7f35a90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,8 +23,10 @@ ambiguous situations. ## Unreleased ### Fixed -- Shibboleth login fixed. It was broken due to URL parser changes and really +- Shibboleth login. It was broken due to URL parser changes and really *unfortunate* behaviour by aiohttp. +- local video cache on windows if the path was changed to accomodate windows + file system limitations (e.g. replace `:`) ## 3.3.0 - 2022-01-09 diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c26ce8b..b197b6b 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -499,7 +499,7 @@ instance's greatest bottleneck. 
log.explain_topic(f"Checking local cache for video {video_path.name}") all_found_locally = True for video in contained_videos: - transformed_path = self._transformer.transform(video) + transformed_path = self._to_local_video_path(video) if transformed_path: exists_locally = self._output_dir.resolve(transformed_path).exists() all_found_locally = all_found_locally and exists_locally @@ -509,6 +509,11 @@ instance's greatest bottleneck. log.explain("Missing at least one video, continuing with requests!") return False + def _to_local_video_path(self, path: PurePath) -> Optional[PurePath]: + if transformed := self._transformer.transform(path): + return self._deduplicator.fixup_path(transformed) + return None + @anoncritical @_iorepeat(3, "downloading video") async def _download_video( @@ -528,7 +533,7 @@ instance's greatest bottleneck. log.explain(f"Using single video mode for {element.name}") stream_element = stream_elements[0] - transformed_path = self._transformer.transform(original_path) + transformed_path = self._to_local_video_path(original_path) if not transformed_path: raise CrawlError(f"Download returned a path but transform did not for {original_path}") diff --git a/PFERD/deduplicator.py b/PFERD/deduplicator.py index ef62dcb..7777f28 100644 --- a/PFERD/deduplicator.py +++ b/PFERD/deduplicator.py @@ -56,6 +56,12 @@ class Deduplicator: log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility") return new_path + def fixup_path(self, path: PurePath) -> PurePath: + """Fixes up the path for windows, if enabled. 
Returns the path unchanged otherwise.""" + if self._windows_paths: + return self._fixup_for_windows(path) + return path + def mark(self, path: PurePath) -> PurePath: if self._windows_paths: path = self._fixup_for_windows(path) From 4f022e2d192552ddef22b169044f2692bc4e1563 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 Jan 2022 15:06:02 +0100 Subject: [PATCH 359/524] Reword changelog --- CHANGELOG.md | 6 ++---- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f35a90..76cf836 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,10 +23,8 @@ ambiguous situations. ## Unreleased ### Fixed -- Shibboleth login. It was broken due to URL parser changes and really - *unfortunate* behaviour by aiohttp. -- local video cache on windows if the path was changed to accomodate windows - file system limitations (e.g. replace `:`) +- ILIAS login +- Local video cache if `windows_paths` is enabled ## 3.3.0 - 2022-01-09 diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index b197b6b..a3e37a9 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -773,7 +773,7 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea """ aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and - build encoded URL objects ourselfs... Who thought mangling location header was a good idea?? + build encoded URL objects ourselves... Who thought mangling location header was a good idea?? 
""" async with session.post(url, data=data, allow_redirects=False) as response: location = response.headers.get("location") From 86947e4874f0853444e38de0fac4d2ddab5ae41e Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 Jan 2022 15:11:22 +0100 Subject: [PATCH 360/524] Bump version to 3.3.1 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76cf836..d5f9dc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.3.1 - 2022-01-15 + ### Fixed - ILIAS login - Local video cache if `windows_paths` is enabled diff --git a/PFERD/version.py b/PFERD/version.py index ca58f3a..37e91f3 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.3.0" +VERSION = "3.3.1" From 7872fe5221c4c8b95b59ffe54f879c1c39e736f3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 18 Jan 2022 22:32:43 +0100 Subject: [PATCH 361/524] Fix tables with more columns than expected --- PFERD/crawl/ilias/kit_ilias_html.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 754af16..94b2e4b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -280,11 +280,22 @@ class IliasPage: def _listed_video_to_element(self, link: Tag) -> IliasPageElement: # The link is part of a table with multiple columns, describing metadata. - # 6th child (1 indexed) is the modification time string - modification_string = link.parent.parent.parent.select_one( - "td.std:nth-child(6)" - ).getText().strip() - modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") + # 6th or 7th child (1 indexed) is the modification time string. 
Try to find it + # by parsing backwards from the end and finding something that looks like a date + modification_time = None + row: Tag = link.parent.parent.parent + column_count = len(row.select("td.std")) + for index in range(column_count, 0, -1): + modification_string = link.parent.parent.parent.select_one( + f"td.std:nth-child({index})" + ).getText().strip() + if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): + modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") + break + + if modification_time is None: + log.warn(f"Could not determine upload time for {link}") + modification_time = datetime.now() title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() title += ".mp4" From 86e2e226dcefb98232410cc2289d11a664076adc Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 3 Apr 2022 11:32:38 +0200 Subject: [PATCH 362/524] Notify user when shibboleth presents new entitlements --- CHANGELOG.md | 2 ++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5f9dc6..4e11224 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ ambiguous situations. ### Fixed - ILIAS login - Local video cache if `windows_paths` is enabled +- Report when Shibboleth reviews entitlements +- Support for video listings with more columns ## 3.3.0 - 2022-01-09 diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index a3e37a9..2a5fc87 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -710,6 +710,12 @@ class KitShibbolethLogin: } soup = await _post(sess, url, data) + if soup.find(id="attributeRelease"): + raise CrawlError( + "ILIAS Shibboleth entitlements changed! 
" + "Please log in once in your browser and review them" + ) + if self._tfa_required(soup): soup = await self._authenticate_tfa(sess, soup) From da72863b471c048768a0d8234ba02298b1f9e4c1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 3 Apr 2022 13:19:08 +0200 Subject: [PATCH 363/524] Placate newer mypy --- PFERD/logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/logging.py b/PFERD/logging.py index e2d64fc..e833716 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -68,7 +68,7 @@ class Log: if self._download_progress.task_ids: elements.append(self._download_progress) - group = Group(*elements) # type: ignore + group = Group(*elements) self._live.update(group) @contextmanager From a2831fbea2e8758686677c44645fdd6f3cbc40fa Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 13:55:24 +0200 Subject: [PATCH 364/524] Fix shib authentication Authentication failed previously if the shib session was still valid. If Shibboleth gets a request and the session is still valid, it directly responds without a second redirect. 
--- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 2a5fc87..571e4d7 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -784,15 +784,19 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea async with session.post(url, data=data, allow_redirects=False) as response: location = response.headers.get("location") if not location: - raise CrawlWarning(f"Login failed, no location header present at {url}") + raise CrawlWarning(f"Login failed (1), no location header present at {url}") correct_url = yarl.URL(location, encoded=True) async with session.get(correct_url, allow_redirects=False) as response: - as_yarl = yarl.URL(response.url) location = response.headers.get("location") + # If shib still still has a valid session, it will directly respond to the request + if location is None: + return soupify(await response.read()) + as_yarl = yarl.URL(response.url) + # Probably not needed anymore, but might catch a few weird situations with a nicer message if not location or not as_yarl.host: - raise CrawlWarning(f"Login failed, no location header present at {correct_url}") + raise CrawlWarning(f"Login failed (2), no location header present at {correct_url}") correct_url = yarl.URL.build( scheme=as_yarl.scheme, From f17b9b68f4cdc397b029361260d35aad7e778308 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 14:01:40 +0200 Subject: [PATCH 365/524] Add shibboleth authentication fix to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e11224..b3da789 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ ambiguous situations. 
- Local video cache if `windows_paths` is enabled - Report when Shibboleth reviews entitlements - Support for video listings with more columns +- Authentication when the shib session is still valid ## 3.3.0 - 2022-01-09 From 07a21f80a63dfd4f47dae4dadc8e515334a9891d Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 21:15:33 +0200 Subject: [PATCH 366/524] Link to unofficial packages --- CHANGELOG.md | 3 +++ README.md | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b3da789..c64b69a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- Links to unofficial packages and repology in the readme + ## 3.3.1 - 2022-01-15 ### Fixed diff --git a/README.md b/README.md index 836147f..b8b2551 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,14 @@ $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. +### With package managers + +Unofficial packages are available for: +- [AUR](https://aur.archlinux.org/packages/pferd) +- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) + +See also PFERD's [repology page](https://repology.org/project/pferd/versions). + ## Basic usage PFERD can be run directly from the command line with no config file. Run `pferd From ba3d299c05bae299a3da5c378e9c5f311e78f62f Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 21:23:55 +0200 Subject: [PATCH 367/524] Fix changelog --- CHANGELOG.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c64b69a..c5480f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,16 +23,18 @@ ambiguous situations. 
## Unreleased ### Added +- Message when Shibboleth entitlements need to be manually reviewed +- Support for video listings with more columns - Links to unofficial packages and repology in the readme +### Fixed +- Crash during authentication when the Shibboleth session is still valid + ## 3.3.1 - 2022-01-15 ### Fixed - ILIAS login - Local video cache if `windows_paths` is enabled -- Report when Shibboleth reviews entitlements -- Support for video listings with more columns -- Authentication when the shib session is still valid ## 3.3.0 - 2022-01-09 From a99ddaa0cc28e04edfc95d541f0b1f6ca885965c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 21:47:51 +0200 Subject: [PATCH 368/524] Read and write config in UTF-8 --- PFERD/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PFERD/config.py b/PFERD/config.py index 0ea7abc..5635573 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -120,7 +120,7 @@ class Config: # Using config.read_file instead of config.read because config.read # would just ignore a missing file and carry on. 
try: - with open(path) as f: + with open(path, encoding="utf-8") as f: parser.read_file(f, source=str(path)) except FileNotFoundError: raise ConfigLoadError(path, "File does not exist") @@ -154,12 +154,12 @@ class Config: try: # x = open for exclusive creation, failing if the file already # exists - with open(path, "x") as f: + with open(path, "x", encoding="utf-8") as f: self._parser.write(f) except FileExistsError: print("That file already exists.") if asyncio.run(prompt_yes_no("Overwrite it?", default=False)): - with open(path, "w") as f: + with open(path, "w", encoding="utf-8") as f: self._parser.write(f) else: raise ConfigDumpError(path, "File already exists") From a709280cbf0bf5dbb62507f9829647862ef5f6bc Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 21:48:09 +0200 Subject: [PATCH 369/524] Try to detect unsupported config file encoding The encoding detection is quite rudimentary, but should detect the default windows encoding in many cases. --- PFERD/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PFERD/config.py b/PFERD/config.py index 5635573..8f7e682 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -128,6 +128,8 @@ class Config: raise ConfigLoadError(path, "That's a directory, not a file") except PermissionError: raise ConfigLoadError(path, "Insufficient permissions") + except UnicodeDecodeError: + raise ConfigLoadError(path, "File is not encoded using UTF-8") def dump(self, path: Optional[Path] = None) -> None: """ From 00db34821825a719712f6bc25420bdfaed9bda11 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 21:53:29 +0200 Subject: [PATCH 370/524] Update changelog --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5480f2..e70d328 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,9 +24,12 @@ ambiguous situations. 
### Added - Message when Shibboleth entitlements need to be manually reviewed -- Support for video listings with more columns - Links to unofficial packages and repology in the readme +### Changed +- Support video listings with more columns +- Use UTF-8 when reading/writing the config file + ### Fixed - Crash during authentication when the Shibboleth session is still valid From 31631fb409d80f7c0cf8dd964da993ef08aa6fe5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 22:16:47 +0200 Subject: [PATCH 371/524] Increase minimum python version to 3.9 --- .github/workflows/build-and-release.yml | 2 +- CHANGELOG.md | 1 + README.md | 2 +- setup.cfg | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 565c4e3..090ac7e 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python: ["3.8"] + python: ["3.9"] steps: - uses: actions/checkout@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index e70d328..7cee430 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ ambiguous situations. - Links to unofficial packages and repology in the readme ### Changed +- Increase minimum supported Python version to 3.9 - Support video listings with more columns - Use UTF-8 when reading/writing the config file diff --git a/README.md b/README.md index b8b2551..ce917b0 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the ### With pip -Ensure you have at least Python 3.8 installed. Run the following command to +Ensure you have at least Python 3.9 installed. 
Run the following command to install PFERD or upgrade it to the latest version: ``` diff --git a/setup.cfg b/setup.cfg index 059798a..2378c48 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ version = attr: PFERD.version.VERSION [options] packages = find: -python_requires = >=3.8 +python_requires = >=3.9 install_requires = aiohttp>=3.8.1 beautifulsoup4>=4.10.0 From 602044ff1b0b49348a50248f7f93334df979044a Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 22:50:06 +0200 Subject: [PATCH 372/524] Fix mypy errors and add missing await --- PFERD/crawl/crawler.py | 5 +++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 21 ++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 53f43e9..0e67c02 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -1,9 +1,10 @@ import asyncio import os from abc import ABC, abstractmethod +from collections.abc import Awaitable, Coroutine from datetime import datetime from pathlib import Path, PurePath -from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar from ..auth import Authenticator from ..config import Config, Section @@ -58,7 +59,7 @@ def noncritical(f: Wrapped) -> Wrapped: return wrapper # type: ignore -AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) +AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]]) def anoncritical(f: AWrapped) -> AWrapped: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 571e4d7..ae9ebd4 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,7 +1,8 @@ import asyncio import re +from collections.abc import Awaitable, Coroutine from pathlib import PurePath -from typing import Any, Awaitable, 
Callable, Dict, List, Optional, Set, TypeVar, Union, cast +from typing import Any, Callable, Dict, List, Optional, Set, Union, cast import aiohttp import yarl @@ -13,7 +14,7 @@ from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param -from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical +from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement @@ -82,8 +83,6 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([ IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, ]) -AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) - def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: def decorator(f: AWrapped) -> AWrapped: @@ -252,7 +251,7 @@ instance's greatest bottleneck. url: str, parent: IliasPageElement, path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: maybe_cl = await self.crawl(path) if not maybe_cl: return None @@ -310,7 +309,7 @@ instance's greatest bottleneck. self, parent_path: PurePath, element: IliasPageElement, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: if element.url in self._visited_urls: raise CrawlWarning( f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" @@ -360,7 +359,7 @@ instance's greatest bottleneck. self, element: IliasPageElement, element_path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain(f"Links type is {self._links}") @@ -407,7 +406,7 @@ instance's greatest bottleneck. 
self, element: IliasPageElement, element_path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}") log.explain(f"Links type is {self._links}") @@ -443,7 +442,7 @@ instance's greatest bottleneck. if hdrs.LOCATION not in resp.headers: return soupify(await resp.read()).select_one("a").get("href").strip() - self._authenticate() + await self._authenticate() async with self.session.get(export_url, allow_redirects=False) as resp: # No redirect means we were authenticated @@ -456,7 +455,7 @@ instance's greatest bottleneck. self, element: IliasPageElement, element_path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: # Copy old mapping as it is likely still relevant if self.prev_report: self.report.add_custom_value( @@ -564,7 +563,7 @@ instance's greatest bottleneck. self, element: IliasPageElement, element_path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: return None From d2e6d918806310a3bcda7a82c74853b7f59eb99f Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 22:50:36 +0200 Subject: [PATCH 373/524] Make PFERD executable via python -m --- PFERD/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index bdf5b34..4faeb13 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -159,3 +159,7 @@ def main() -> None: sys.exit(1) else: pferd.print_report() + + +if __name__ == "__main__": + main() From aa74604d293ec25ae7f94431d4431313dabfc26c Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 29 Apr 2022 23:11:27 +0200 Subject: [PATCH 374/524] Use utf-8 for report --- PFERD/output_dir.py | 2 +- PFERD/report.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 441717b..c92f4a6 
100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -503,7 +503,7 @@ class OutputDirectory: try: self._prev_report = Report.load(self._report_path) log.explain("Loaded report successfully") - except (OSError, json.JSONDecodeError, ReportLoadError) as e: + except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e: log.explain("Failed to load report") log.explain(str(e)) diff --git a/PFERD/report.py b/PFERD/report.py index 0e0c789..0eaaca9 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -100,10 +100,10 @@ class Report: @classmethod def load(cls, path: Path) -> "Report": """ - May raise OSError, JsonDecodeError, ReportLoadError. + May raise OSError, UnicodeDecodeError, JsonDecodeError, ReportLoadError. """ - with open(path) as f: + with open(path, encoding="utf-8") as f: data = json.load(f) if not isinstance(data, dict): @@ -148,7 +148,7 @@ class Report: "encountered_errors": self.encountered_errors, } - with open(path, "w") as f: + with open(path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, sort_keys=True) f.write("\n") # json.dump doesn't do this From b56475450de9a00a0ab12bfdf9adf9b5b229f38e Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 29 Apr 2022 23:12:41 +0200 Subject: [PATCH 375/524] Use utf-8 for cookies --- PFERD/crawl/http_crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index fa4cf29..44ec4dd 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -108,7 +108,7 @@ class HttpCrawler(Crawler): def _load_cookies_from_file(self, path: Path) -> None: jar: Any = http.cookies.SimpleCookie() - with open(path) as f: + with open(path, encoding="utf-8") as f: for i, line in enumerate(f): # Names of headers are case insensitive if line[:11].lower() == "set-cookie:": @@ -121,7 +121,7 @@ class HttpCrawler(Crawler): jar: Any = http.cookies.SimpleCookie() for morsel in self._cookie_jar: jar[morsel.key] = 
morsel - with open(path, "w") as f: + with open(path, "w", encoding="utf-8") as f: f.write(jar.output(sep="\n")) f.write("\n") # A trailing newline is just common courtesy From a8f76e9be76f4bb0ee24030ea252354ede1c8ce4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 29 Apr 2022 23:15:12 +0200 Subject: [PATCH 376/524] Use utf-8 for credential file --- PFERD/auth/credential_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PFERD/auth/credential_file.py b/PFERD/auth/credential_file.py index d0fcdda..94ffa73 100644 --- a/PFERD/auth/credential_file.py +++ b/PFERD/auth/credential_file.py @@ -20,8 +20,10 @@ class CredentialFileAuthenticator(Authenticator): path = config.default_section.working_dir() / section.path() try: - with open(path) as f: + with open(path, encoding="utf-8") as f: lines = list(f) + except UnicodeDecodeError: + raise AuthLoadError(f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8") except OSError as e: raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e From a241672726529d1a0ed852b1db2df7968ee6f137 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 1 May 2022 22:29:06 +0200 Subject: [PATCH 377/524] Bump version to 3.4.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cee430..310059a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.4.0 - 2022-05-01 + ### Added - Message when Shibboleth entitlements need to be manually reviewed - Links to unofficial packages and repology in the readme diff --git a/PFERD/version.py b/PFERD/version.py index 37e91f3..8102d37 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.3.1" +VERSION = "3.4.0" From b8fe25c580a8cafc14c32890f0635c7daecafc4d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 May 2022 14:13:39 +0200 Subject: [PATCH 378/524] Add `.cpp` to ipd link regex --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 310059a..22fdd29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Changed +- Add `.cpp` to IPD link regex + ## 3.4.0 - 2022-05-01 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 1a5314b..e5ec58f 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -27,7 +27,7 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target def link_regex(self) -> Pattern[str]: - regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|java)$") + regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$") return re.compile(regex) From afbd03f7774a1c0f22c471d98f995153bb08edcd Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:15:48 +0200 Subject: [PATCH 379/524] Fix docs --- CHANGELOG.md | 2 +- CONFIG.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22fdd29..f5af29d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ ambiguous situations. 
## Unreleased ### Changed -- Add `.cpp` to IPD link regex +- Add `cpp` extension to default `link_regex` of IPD crawler ## 3.4.0 - 2022-05-01 diff --git a/CONFIG.md b/CONFIG.md index 569780d..1355c34 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -146,7 +146,7 @@ requests is likely a good idea. - `target`: URL to a KIT-IPD page - `link_regex`: A regex that is matched against the `href` part of links. If it matches, the given link is downloaded as a file. This is used to extract - files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|java)$`) + files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$`) ### The `kit-ilias-web` crawler From bc3fa36637b5a4f4ea26db1a9437e4cbd5cad5c4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:20:45 +0200 Subject: [PATCH 380/524] Fix IPD crawler crashing on weird HTML comments --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5af29d..de7b795 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ ambiguous situations. ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler +### Fixed +- IPD crawler crashes on some sites + ## 3.4.0 - 2022-05-01 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index e5ec58f..58e71f8 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -161,4 +161,10 @@ class KitIpdCrawler(HttpCrawler): async def get_page(self) -> BeautifulSoup: async with self.session.get(self._url) as request: - return soupify(await request.read()) + # The web page for Algorithmen für Routenplanung contains some + # weird comments that beautifulsoup doesn't parse correctly. This + # hack enables those pages to be crawled, and should hopefully not + # cause issues on other pages. 
+    content = (await request.read()).decode("utf-8") + content = re.sub(r"<!--.*?-->", "", content) + return soupify(content.encode("utf-8")) From af2cc1169ace7154349518f7f709023eeb76ba95 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:23:19 +0200 Subject: [PATCH 381/524] Mention href for users of link_regex option --- CHANGELOG.md | 1 + PFERD/crawl/kit_ipd_crawler.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de7b795..959fda0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler +- Mention hrefs in IPD crawler for users of `link_regex` option ### Fixed - IPD crawler crashes on some sites diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 58e71f8..78fe0b1 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -45,7 +45,7 @@ class KitIpdFolder: def explain(self) -> None: log.explain_topic(f"Folder {self.name!r}") for file in self.files: - log.explain(f"File {file.name!r}") + log.explain(f"File {file.name!r} (href={file.url!r})") def __hash__(self) -> int: return self.name.__hash__() @@ -113,7 +113,7 @@ class KitIpdCrawler(HttpCrawler): else: file = self._extract_file(element) items.add(file) - log.explain_topic(f"Orphan file {file.name!r}") + log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") log.explain("Attributing it to root folder") return items From 694ffb4d7711265d768a636cf1843e302485c62d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:28:30 +0200 Subject: [PATCH 382/524] Fix meeting date parsing Apparently the new pattern ": <date>," was added. This patch adds support for it. 
--- PFERD/crawl/ilias/kit_ilias_html.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 94b2e4b..dfe111d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -763,9 +763,14 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti """ try: date_str = re.sub(r"\s+", " ", date_str) + date_str = re.sub("(Gestern|Yesterday):", "", date_str, re.I) + date_str = re.sub("(Heute|Today):", "", date_str, re.I) + date_str = re.sub("(Morgen|Tomorrow):", "", date_str, re.I) + date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) + date_str = date_str.strip() for german, english in zip(german_months, english_months): date_str = date_str.replace(german, english) # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" From bcc537468c46088f78a037fb28364866e8653bb5 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:53:37 +0200 Subject: [PATCH 383/524] Fix crawling of expanded meetings The last meeting on every page is expanded by default. Its content is then shown inline *and* in the meeting page itself. We should skip the inline content. --- PFERD/crawl/ilias/kit_ilias_html.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index dfe111d..d93684c 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -428,6 +428,12 @@ class IliasPage: element_type = self._find_type_from_link(element_name, link, abs_url) description = self._find_link_description(link) + # The last meeting on every page is expanded by default. 
+ # Its content is then shown inline *and* in the meeting page itself. + # We should skip the inline content. + if element_type != IliasElementType.MEETING and self._is_in_expanded_meeting(link): + continue + if not element_type: continue if element_type == IliasElementType.MEETING: @@ -445,6 +451,26 @@ class IliasPage: return result + def _is_in_expanded_meeting(self, tag: Tag) -> bool: + """ + Returns whether a file is part of an expanded meeting. + Has false positives for meetings themselves as their title is also "in the expanded meeting content". + It is in the same general div and this whole thing is guesswork. + Therefore, you should check for meetings before passing them in this function. + """ + parents: List[Tag] = list(tag.parents) + for parent in parents: + if not parent.get("class"): + continue + + # We should not crawl files under meetings + if "ilContainerListItemContentCB" in parent.get("class"): + link: Tag = parent.parent.find("a") + type = IliasPage._find_type_from_folder_like(link, self._page_url) + return type == IliasElementType.MEETING + + return False + def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: """ Interprets accordions and expandable blocks as virtual folders and returns them From 2f0e04ce13ebbc7c7ccaa93e03d8f707f246ceef Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:57:55 +0200 Subject: [PATCH 384/524] Adjust changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 959fda0..4249287 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ ambiguous situations. 
### Fixed - IPD crawler crashes on some sites +- Meeting name normalization for yesterday, today and tomorrow fails +- Crawling of meeting file previews ## 3.4.0 - 2022-05-01 From 616b0480f7c92afe11c36d2c105c99ba5f960e96 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 8 May 2022 17:39:18 +0200 Subject: [PATCH 385/524] Simplify IPD crawler link regex --- CHANGELOG.md | 5 +++-- CONFIG.md | 2 +- PFERD/crawl/kit_ipd_crawler.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4249287..e2d3840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,11 +24,12 @@ ambiguous situations. ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler -- Mention hrefs in IPD crawler for users of `link_regex` option +- Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option +- Simplify default IPD crawler `link_regex` ### Fixed - IPD crawler crashes on some sites -- Meeting name normalization for yesterday, today and tomorrow fails +- Meeting name normalization for yesterday, today and tomorrow - Crawling of meeting file previews ## 3.4.0 - 2022-05-01 diff --git a/CONFIG.md b/CONFIG.md index 1355c34..f572a80 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -146,7 +146,7 @@ requests is likely a good idea. - `target`: URL to a KIT-IPD page - `link_regex`: A regex that is matched against the `href` part of links. If it matches, the given link is downloaded as a file. This is used to extract - files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$`) + files from KIT-IPD pages. 
(Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`) ### The `kit-ilias-web` crawler diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 78fe0b1..d9fac32 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -27,7 +27,7 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target def link_regex(self) -> Pattern[str]: - regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$") + regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$") return re.compile(regex) From a5015fe9b16d484613a27687f2c122b15e109ba2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 8 May 2022 23:21:18 +0200 Subject: [PATCH 386/524] Correctly parse day-only meeting dates I failed to recognize the correct format in the previous adjustment, so this (hopefully) fixes it for good. Meetings apparently don't always have a time portion. --- PFERD/crawl/ilias/kit_ilias_html.py | 48 +++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d93684c..6d063b6 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -746,17 +746,26 @@ class IliasPage: Normalizes meeting names, which have a relative time as their first part, to their date in ISO format. """ - date_portion_str = meeting_name.split(" - ")[0] + + # This checks whether we can reach a `:` without passing a `-` + if re.search(r"^[^-]+: ", meeting_name): + # Meeting name only contains date: "05. Jan 2000:" + split_delimiter = ":" + else: + # Meeting name contains date and start/end times: "05. 
Jan 2000, 16:00 - 17:30:" + split_delimiter = ", " + + # We have a meeting day without time + date_portion_str = meeting_name.split(split_delimiter)[0] date_portion = demangle_date(date_portion_str) + # We failed to parse the date, bail out if not date_portion: return meeting_name - rest_of_name = meeting_name - if rest_of_name.startswith(date_portion_str): - rest_of_name = rest_of_name[len(date_portion_str):] - - return datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name + # Replace the first section with the absolute date + rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) + return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name def _abs_url_from_link(self, link_tag: Tag) -> str: """ @@ -781,17 +790,15 @@ english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]: """ - Demangle a given date in one of the following formats: + Demangle a given date in one of the following formats (hour/minute part is optional): "Gestern, HH:MM" "Heute, HH:MM" "Morgen, HH:MM" "dd. mon yyyy, HH:MM """ try: + # Normalize whitespace because users date_str = re.sub(r"\s+", " ", date_str) - date_str = re.sub("(Gestern|Yesterday):", "", date_str, re.I) - date_str = re.sub("(Heute|Today):", "", date_str, re.I) - date_str = re.sub("(Morgen|Tomorrow):", "", date_str, re.I) date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) @@ -802,19 +809,28 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" date_str = date_str.replace(english + ".", english) - # We now have a nice english String in the format: "dd. 
mmm yyyy, hh:mm" - day_part, time_part = date_str.split(",") + # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" or "dd. mmm yyyy" + + # Check if we have a time as well + if ", " in date_str: + day_part, time_part = date_str.split(",") + else: + day_part = date_str.split(",")[0] + time_part = None + day_str, month_str, year_str = day_part.split(" ") day = int(day_str.strip().replace(".", "")) month = english_months.index(month_str.strip()) + 1 year = int(year_str.strip()) - hour_str, minute_str = time_part.split(":") - hour = int(hour_str) - minute = int(minute_str) + if time_part: + hour_str, minute_str = time_part.split(":") + hour = int(hour_str) + minute = int(minute_str) + return datetime(year, month, day, hour, minute) - return datetime(year, month, day, hour, minute) + return datetime(year, month, day) except Exception: if not fail_silently: log.warn(f"Date parsing failed for {date_str!r}") From 846c29aee1867f7f0b7efae802af47fee77a3ec6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 11 May 2022 21:16:09 +0200 Subject: [PATCH 387/524] Download page descriptions --- CHANGELOG.md | 3 + PFERD/crawl/ilias/ilias_html_cleaner.py | 91 ++++++++++++++++++++++ PFERD/crawl/ilias/kit_ilias_html.py | 25 ++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 29 +++++++ 4 files changed, 148 insertions(+) create mode 100644 PFERD/crawl/ilias/ilias_html_cleaner.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e2d3840..b7cad13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Added +- Download of page descriptions + ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py new file mode 100644 index 0000000..5952309 --- /dev/null +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -0,0 +1,91 @@ +from bs4 import BeautifulSoup, Comment, Tag + +_STYLE_TAG_CONTENT = """ + .ilc_text_block_Information { + background-color: #f5f7fa; + } + div.ilc_text_block_Standard { + margin-bottom: 10px; + margin-top: 10px; + } + span.ilc_text_inline_Strong { + font-weight: bold; + } + + .accordion-head { + background-color: #f5f7fa; + padding: 0.5rem 0; + } + + h3 { + margin-top: 0.5rem; + margin-bottom: 1rem; + } + + br.visible-break { + margin-bottom: 1rem; + } + + article { + margin: 0.5rem 0; + } + + body { + padding: 1em; + grid-template-columns: 1fr min(60rem, 90%) 1fr; + line-height: 1.2; + } +""" + +_ARTICLE_WORTHY_CLASSES = [ + "ilc_text_block_Information", + "ilc_section_Attention", + "ilc_section_Link", +] + + +def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: + head = soup.new_tag("head") + soup.insert(0, head) + + simplecss_link: Tag = soup.new_tag("link") + # + simplecss_link["rel"] = "stylesheet" + simplecss_link["href"] = "https://cdn.simplecss.org/simple.css" + head.append(simplecss_link) + + # Basic style tags for compat + style: Tag = soup.new_tag("style") + style.append(_STYLE_TAG_CONTENT) + head.append(style) + + return soup + + +def clean(soup: BeautifulSoup) -> BeautifulSoup: + for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES): + block.name = "article" + + for block in soup.find_all("h3"): + block.name = "div" + + for block in soup.find_all("h1"): + block.name = "h3" + + for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"): + block.name = "h3" + block["class"] += 
["accordion-head"] + + for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"): + children = list(dummy.children) + if not children: + dummy.decompose() + if len(children) > 1: + continue + if type(children[0]) == Comment: + dummy.decompose() + + for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): + hrule_imposter.insert(0, soup.new_tag("hr")) + + return soup diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 6d063b6..d58e5c8 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -85,6 +85,31 @@ class IliasPage: log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() + def get_description(self) -> Optional[BeautifulSoup]: + def is_interesting_class(name: str) -> bool: + return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] + + paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class) + if not paragraphs: + return None + + # Extract bits and pieces into a string and parse it again. + # This ensures we don't miss anything and weird structures are resolved + # somewhat gracefully. 
+ raw_html = "" + for p in paragraphs: + if p.find_parent(class_=is_interesting_class): + continue + + # Ignore special listings (like folder groupings) + if "ilc_section_Special" in p["class"]: + continue + + raw_html += str(p) + "\n" + raw_html = f"\n{raw_html}\n" + + return BeautifulSoup(raw_html, "html.parser") + def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_ilias_opencast_embedding(): return self.get_child_elements()[0] diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ae9ebd4..bbed986 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links +from .ilias_html_cleaner import clean, insert_base_markup from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement TargetType = Union[str, int] @@ -215,6 +216,8 @@ instance's greatest bottleneck. cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 elements: List[IliasPageElement] = [] + # A list as variable redefinitions are not propagated to outer scopes + description: List[BeautifulSoup] = [] @_iorepeat(3, "crawling url") async def gather_elements() -> None: @@ -233,9 +236,15 @@ instance's greatest bottleneck. page = IliasPage(soup, url, None) elements.extend(page.get_child_elements()) + if description_string := page.get_description(): + description.append(description_string) + # Fill up our task list with the found elements await gather_elements() + if description: + await self._download_description(PurePath("."), description[0]) + elements.sort(key=lambda e: e.id()) tasks: List[Awaitable[None]] = [] @@ -265,6 +274,8 @@ instance's greatest bottleneck. 
cl: CrawlToken, ) -> None: elements: List[IliasPageElement] = [] + # A list as variable redefinitions are not propagated to outer scopes + description: List[BeautifulSoup] = [] @_iorepeat(3, "crawling folder") async def gather_elements() -> None: @@ -285,10 +296,15 @@ instance's greatest bottleneck. next_stage_url = None elements.extend(page.get_child_elements()) + if description_string := page.get_description(): + description.append(description_string) # Fill up our task list with the found elements await gather_elements() + if description: + await self._download_description(PurePath("."), description[0]) + elements.sort(key=lambda e: e.id()) tasks: List[Awaitable[None]] = [] @@ -425,6 +441,19 @@ instance's greatest bottleneck. return self._download_booking(element, link_template_maybe, maybe_dl) + @anoncritical + @_iorepeat(1, "downloading description") + async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None: + path = parent_path / "Description.html" + dl = await self.download(path, redownload=Redownload.ALWAYS) + if not dl: + return + + async with dl as (bar, sink): + description = clean(insert_base_markup(description)) + sink.file.write(description.prettify().encode("utf-8")) + sink.done() + @anoncritical @_iorepeat(3, "resolving booking") async def _download_booking( From 46fb782798725b6fde76b71cf7a4d90912ea2c7d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 24 May 2022 23:28:09 +0200 Subject: [PATCH 388/524] Add forum crawling This downloads all forum posts when needed and saves each thread in its own html file, named after the thread title. 
--- CHANGELOG.md | 1 + PFERD/cli/command_kit_ilias_web.py | 7 ++ PFERD/crawl/ilias/kit_ilias_html.py | 90 ++++++++++++++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 122 ++++++++++++++++++--- PFERD/logging.py | 4 +- 5 files changed, 208 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7cad13..1d70c4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Added - Download of page descriptions +- Forum download support ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index 12803a6..de74fc3 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -62,6 +62,11 @@ GROUP.add_argument( action=BooleanOptionalAction, help="crawl and download videos" ) +GROUP.add_argument( + "--forums", + action=BooleanOptionalAction, + help="crawl and download forum posts" +) GROUP.add_argument( "--http-timeout", "-t", type=float, @@ -90,6 +95,8 @@ def load( section["link_redirect_delay"] = str(args.link_redirect_delay) if args.videos is not None: section["videos"] = "yes" if args.videos else "no" + if args.forums is not None: + section["forums"] = "yes" if args.forums else "no" if args.http_timeout is not None: section["http_timeout"] = str(args.http_timeout) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d58e5c8..7bab152 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -55,6 +55,20 @@ class IliasPageElement: return self.url +@dataclass +class IliasDownloadForumData: + url: str + form_data: Dict[str, 
Union[str, List[str]]] + + +@dataclass +class IliasForumThread: + title: str + title_tag: Tag + content_tag: Tag + mtime: Optional[datetime] + + class IliasPage: def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): @@ -110,13 +124,39 @@ class IliasPage: return BeautifulSoup(raw_html, "html.parser") + def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: + form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) + if not form: + return None + post_url = self._abs_url_from_relative(form["action"]) + + form_data: Dict[str, Union[str, List[ſtr]]] = { + "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})], + "selected_cmd2": "html", + "select_cmd2": "Ausführen", + "selected_cmd": "", + } + + return IliasDownloadForumData(post_url, form_data) + def get_next_stage_element(self) -> Optional[IliasPageElement]: + if self._is_forum_page(): + if "trows=800" in self._page_url: + return None + return self._get_show_max_forum_entries_per_page_url() if self._is_ilias_opencast_embedding(): return self.get_child_elements()[0] if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: return self._find_video_entries_paginated()[0] return None + def _is_forum_page(self) -> bool: + read_more_btn = self._soup.find( + "button", + attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} + ) + return read_more_btn is not None + def _is_video_player(self) -> bool: return "paella_config_file" in str(self._soup) @@ -194,6 +234,19 @@ class IliasPage: return items + def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]: + correct_link = self._soup.find( + "a", + attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x} + ) + + if not correct_link: + return None + + link = self._abs_url_from_link(correct_link) + + return IliasPageElement(IliasElementType.FORUM, link, "show all forum 
threads") + def _find_personal_desktop_entries(self) -> List[IliasPageElement]: items: List[IliasPageElement] = [] @@ -877,3 +930,38 @@ def _tomorrow() -> date: def _sanitize_path_name(name: str) -> str: return name.replace("/", "-").replace("\\", "-").strip() + + +def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThread]: + elements = [] + for p in forum_export.select("body > p"): + title_tag = p + content_tag = p.find_next_sibling("ul") + title = p.find("b").text + if ":" in title: + title = title[title.find(":") + 1:] + title = title.strip() + mtime = _guess_timestamp_from_forum_post_content(content_tag) + elements.append(IliasForumThread(title, title_tag, content_tag, mtime)) + + return elements + + +def _guess_timestamp_from_forum_post_content(content: Tag) -> Optional[datetime]: + posts: Optional[Tag] = content.select(".ilFrmPostHeader > span.small") + if not posts: + return None + + newest_date: Optional[datetime] = None + + for post in posts: + text = post.text.strip() + text = text[text.rfind("|") + 1:] + date = demangle_date(text, fail_silently=True) + if not date: + continue + + if not newest_date or newest_date < date: + newest_date = date + + return newest_date diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index bbed986..156cd4c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -18,7 +18,8 @@ from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadTo from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, + _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -67,6 +68,9 @@ class 
KitIliasWebCrawlerSection(HttpCrawlerSection): def videos(self) -> bool: return self.s.getboolean("videos", fallback=False) + def forums(self) -> bool: + return self.s.getboolean("forums", fallback=False) + _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.EXERCISE, @@ -183,6 +187,7 @@ instance's greatest bottleneck. self._link_file_redirect_delay = section.link_redirect_delay() self._links = section.links() self._videos = section.videos() + self._forums = section.forums() self._visited_urls: Set[str] = set() async def _run(self) -> None: @@ -335,22 +340,27 @@ instance's greatest bottleneck. element_path = PurePath(parent_path, element.name) if element.type in _VIDEO_ELEMENTS: - log.explain_topic(f"Decision: Crawl video element {fmt_path(element_path)}") if not self._videos: - log.explain("Video crawling is disabled") - log.explain("Answer: no") + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](enable with option 'videos')" + ) return None - else: - log.explain("Video crawling is enabled") - log.explain("Answer: yes") if element.type == IliasElementType.FILE: return await self._handle_file(element, element_path) elif element.type == IliasElementType.FORUM: - log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") - log.explain("Forums are not supported") - log.explain("Answer: No") - return None + if not self._forums: + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](enable with option 'forums')" + ) + return None + return await self._handle_forum(element, element_path) elif element.type == IliasElementType.TEST: log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Tests contain no relevant files") @@ -635,6 +645,68 @@ instance's greatest bottleneck. 
if not await try_stream(): raise CrawlError("File streaming failed after authenticate()") + async def _handle_forum( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Coroutine[Any, Any, None]]: + maybe_cl = await self.crawl(element_path) + if not maybe_cl: + return None + return self._crawl_forum(element, maybe_cl) + + @_iorepeat(3, "crawling forum") + @anoncritical + async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: + elements = [] + + async with cl: + next_stage_url = element.url + while next_stage_url: + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {next_stage_url}") + + soup = await self._get_page(next_stage_url) + page = IliasPage(soup, next_stage_url, None) + + if next := page.get_next_stage_element(): + next_stage_url = next.url + else: + break + + download_data = page.get_download_forum_data() + if not download_data: + raise CrawlWarning("Failed to extract forum data") + html = await self._post_authenticated(download_data.url, download_data.form_data) + elements = parse_ilias_forum_export(soupify(html)) + + elements.sort(key=lambda elem: elem.title) + + tasks: List[Awaitable[None]] = [] + for elem in elements: + tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem))) + + # And execute them + await self.gather(tasks) + + @anoncritical + @_iorepeat(3, "saving forum thread") + async def _download_forum_thread( + self, + parent_path: PurePath, + element: IliasForumThread, + ) -> None: + path = parent_path / (_sanitize_path_name(element.title) + ".html") + maybe_dl = await self.download(path, mtime=element.mtime) + if not maybe_dl: + return + + async with maybe_dl as (bar, sink): + content = element.title_tag.prettify() + content += element.content_tag.prettify() + sink.file.write(content.encode("utf-8")) + sink.done() + async def _get_page(self, url: str) -> BeautifulSoup: auth_id = await self._current_auth_id() async with 
self.session.get(url) as request: @@ -652,13 +724,37 @@ instance's greatest bottleneck. return soup raise CrawlError("get_page failed even after authenticating") + async def _post_authenticated( + self, + url: str, + data: dict[str, Union[str, List[str]]] + ) -> BeautifulSoup: + auth_id = await self._current_auth_id() + + form_data = aiohttp.FormData() + for key, val in data.items(): + form_data.add_field(key, val) + + async with self.session.post(url, data=form_data(), allow_redirects=False) as request: + if request.status == 200: + return await request.read() + + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) + + # Retry once after authenticating. If this fails, we will die. + async with self.session.post(url, data=data, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + raise CrawlError("post_authenticated failed even after authenticating") + # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. - @_iorepeat(3, "Login", failure_is_error=True) + @ _iorepeat(3, "Login", failure_is_error=True) async def _authenticate(self) -> None: await self._shibboleth_login.login(self.session) - @staticmethod + @ staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: # Normal ILIAS pages mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") diff --git a/PFERD/logging.py b/PFERD/logging.py index e833716..340b21f 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -197,7 +197,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_explain: self.print(f" {escape(text)}") - def status(self, style: str, action: str, text: str) -> None: + def status(self, style: str, action: str, text: str, suffix: str = "") -> None: """ Print a status update while crawling. Allows markup in the "style" argument which will be applied to the "action" string. 
@@ -205,7 +205,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_status: action = escape(f"{action:<{self.STATUS_WIDTH}}") - self.print(f"{style}{action}[/] {escape(text)}") + self.print(f"{style}{action}[/] {escape(text)} {suffix}") def report(self, text: str) -> None: """ From ed24366aba7cfb8ca3cdd0df7b2650bc1220437f Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 Jan 2022 16:23:37 +0100 Subject: [PATCH 389/524] Add pass authenticator --- CHANGELOG.md | 1 + CONFIG.md | 21 ++++++++- PFERD/auth/__init__.py | 3 ++ PFERD/auth/pass_.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 PFERD/auth/pass_.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d70c4a..bc9f3e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Added - Download of page descriptions - Forum download support +- `pass` authenticator ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler diff --git a/CONFIG.md b/CONFIG.md index f572a80..0f114ed 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -223,6 +223,23 @@ is stored in the keyring. - `keyring_name`: The service name PFERD uses for storing credentials. (Default: `PFERD`) +### The `pass` authenticator + +This authenticator queries the [`pass` password manager][3] for a username and +password. It tries to be mostly compatible with [browserpass][4] and +[passff][5], so see those links for an overview of the format. If PFERD fails +to load your password, you can use the `--explain` flag to see why. 
+ +- `passname`: The name of the password to use (Required) +- `username_prefixes`: A comma-separated list of username line prefixes + (Default: `login,username,user`) +- `password_prefixes`: A comma-separated list of password line prefixes + (Default: `password,pass,secret`) + +[3]: "Pass: The Standard Unix Password Manager" +[4]: "Organizing password store" +[5]: "Multi-line format" + ### The `tfa` authenticator This authenticator prompts the user on the console for a two-factor @@ -316,7 +333,7 @@ is a regular expression and `TARGET` an f-string based template. If a path matches `SOURCE`, the output path is created using `TARGET` as template. `SOURCE` is automatically anchored. -`TARGET` uses Python's [format string syntax][3]. The *n*-th capturing group can +`TARGET` uses Python's [format string syntax][6]. The *n*-th capturing group can be referred to as `{g}` (e.g. `{g3}`). `{g0}` refers to the original path. If capturing group *n*'s contents are a valid integer, the integer value is available as `{i}` (e.g. `{i3}`). 
If capturing group *n*'s contents are a @@ -337,7 +354,7 @@ Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` - Converts `fooooo/bear` into `BOOOOOH/fear` - Converts `foo/bar/baz` into `BOOH/fear/baz` -[3]: "Format String Syntax" +[6]: "Format String Syntax" ### The `-name-re->` arrow diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 277cade..aa3ba8e 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -5,6 +5,7 @@ from ..config import Config from .authenticator import Authenticator, AuthError, AuthLoadError, AuthSection # noqa: F401 from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection from .keyring import KeyringAuthenticator, KeyringAuthSection +from .pass_ import PassAuthenticator, PassAuthSection from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator @@ -19,6 +20,8 @@ AUTHENTICATORS: Dict[str, AuthConstructor] = { CredentialFileAuthenticator(n, CredentialFileAuthSection(s), c), "keyring": lambda n, s, c: KeyringAuthenticator(n, KeyringAuthSection(s)), + "pass": lambda n, s, c: + PassAuthenticator(n, PassAuthSection(s)), "simple": lambda n, s, c: SimpleAuthenticator(n, SimpleAuthSection(s)), "tfa": lambda n, s, c: diff --git a/PFERD/auth/pass_.py b/PFERD/auth/pass_.py new file mode 100644 index 0000000..4c8e775 --- /dev/null +++ b/PFERD/auth/pass_.py @@ -0,0 +1,98 @@ +import re +import subprocess +from typing import List, Tuple + +from ..logging import log +from .authenticator import Authenticator, AuthError, AuthSection + + +class PassAuthSection(AuthSection): + def passname(self) -> str: + if (value := self.s.get("passname")) is None: + self.missing_value("passname") + return value + + def username_prefixes(self) -> List[str]: + value = self.s.get("username_prefixes", "login,username,user") + return [prefix.lower() for prefix in value.split(",")] + + def password_prefixes(self) -> List[str]: + value = self.s.get("password_prefixes", "password,pass,secret") + 
return [prefix.lower() for prefix in value.split(",")] + + +class PassAuthenticator(Authenticator): + PREFIXED_LINE_RE = r"([a-zA-Z]+):\s?(.*)" # to be used with fullmatch + + def __init__(self, name: str, section: PassAuthSection) -> None: + super().__init__(name) + + self._passname = section.passname() + self._username_prefixes = section.username_prefixes() + self._password_prefixes = section.password_prefixes() + + async def credentials(self) -> Tuple[str, str]: + log.explain_topic("Obtaining credentials from pass") + + try: + log.explain(f"Calling 'pass show {self._passname}'") + result = subprocess.check_output(["pass", "show", self._passname], text=True) + except subprocess.CalledProcessError as e: + raise AuthError(f"Failed to get password info from {self._passname}: {e}") + + prefixed = {} + unprefixed = [] + for line in result.strip().splitlines(): + if match := re.fullmatch(self.PREFIXED_LINE_RE, line): + prefix = match.group(1).lower() + value = match.group(2) + log.explain(f"Found prefixed line {line!r} with prefix {prefix!r}, value {value!r}") + if prefix in prefixed: + raise AuthError(f"Prefix {prefix} specified multiple times") + prefixed[prefix] = value + else: + log.explain(f"Found unprefixed line {line!r}") + unprefixed.append(line) + + username = None + for prefix in self._username_prefixes: + log.explain(f"Looking for username at prefix {prefix!r}") + if prefix in prefixed: + username = prefixed[prefix] + log.explain(f"Found username {username!r}") + break + + password = None + for prefix in self._password_prefixes: + log.explain(f"Looking for password at prefix {prefix!r}") + if prefix in prefixed: + password = prefixed[prefix] + log.explain(f"Found password {password!r}") + break + + if password is None and username is None: + log.explain("No username and password found so far") + log.explain("Using first unprefixed line as password") + log.explain("Using second unprefixed line as username") + elif password is None: + log.explain("No password 
found so far") + log.explain("Using first unprefixed line as password") + elif username is None: + log.explain("No username found so far") + log.explain("Using first unprefixed line as username") + + if password is None: + if not unprefixed: + log.explain("Not enough unprefixed lines left") + raise AuthError("Password could not be determined") + password = unprefixed.pop(0) + log.explain(f"Found password {password!r}") + + if username is None: + if not unprefixed: + log.explain("Not enough unprefixed lines left") + raise AuthError("Username could not be determined") + username = unprefixed.pop(0) + log.explain(f"Found username {username!r}") + + return username, password From 345f52a1f6f55eecf6c31d3cc1a4350c5200087d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:41:29 +0200 Subject: [PATCH 390/524] Detect new login button --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 156cd4c..c99a920 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -759,7 +759,7 @@ instance's greatest bottleneck. 
# Normal ILIAS pages mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") if mainbar is not None: - login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) + login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) shib_login = soup.find(id="button_shib_login") return not login_button and not shib_login From d9b111cec252f4b1810f06b0f2ca551cb5cdb2a2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:45:33 +0200 Subject: [PATCH 391/524] Correctly nest description entries --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c99a920..1852c5f 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -308,7 +308,7 @@ instance's greatest bottleneck. await gather_elements() if description: - await self._download_description(PurePath("."), description[0]) + await self._download_description(cl.path, description[0]) elements.sort(key=lambda e: e.id()) From aa5a3a10bcbfa0dd54a0dc1a533625f76b2d6ed8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:48:59 +0200 Subject: [PATCH 392/524] Adjust changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc9f3e5..7f35c9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,9 @@ ambiguous situations. 
- IPD crawler crashes on some sites - Meeting name normalization for yesterday, today and tomorrow - Crawling of meeting file previews +- Login with new login button html layout +- Descriptions for courses are now placed in the correct subfolder when + downloading the whole desktop ## 3.4.0 - 2022-05-01 From 66a5b1ba0223848f713192b084f2dcd26a18dbe5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 17 Aug 2022 13:24:01 +0200 Subject: [PATCH 393/524] Bump version to 3.4.1 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f35c9c..671d48a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.4.1 - 2022-08-17 + ### Added - Download of page descriptions - Forum download support diff --git a/PFERD/version.py b/PFERD/version.py index 8102d37..8832a51 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.0" +VERSION = "3.4.1" From 4a51aaa4f5a1b3382f0bed59f1292fc0952c2832 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 19 Oct 2022 22:59:33 +0200 Subject: [PATCH 394/524] Fix forum crawling crashing for empty threads --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 671d48a..70d2cd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Fixed +- Forum crawling crashing when parsing empty (= 0 messages) threads + ## 3.4.1 - 2022-08-17 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 7bab152..8795512 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -937,6 +937,13 @@ def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThre for p in forum_export.select("body > p"): title_tag = p content_tag = p.find_next_sibling("ul") + + if not content_tag: + # ILIAS allows users to delete the initial post while keeping the thread open + # This produces empty threads without *any* content. + # I am not sure why you would want this, but ILIAS makes it easy to do. + continue + title = p.find("b").text if ":" in title: title = title[title.find(":") + 1:] From d72fc2760b1dd8243ccf21876bb8cc6e027944bb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 13:09:29 +0200 Subject: [PATCH 395/524] Handle empty forums --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 7 +++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70d2cd5..c7a9899 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. 
### Fixed - Forum crawling crashing when parsing empty (= 0 messages) threads +- Forum crawling crashing when a forum has no threads at all ## 3.4.1 - 2022-08-17 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 8795512..9ea6b9f 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -59,6 +59,7 @@ class IliasPageElement: class IliasDownloadForumData: url: str form_data: Dict[str, Union[str, List[str]]] + empty: bool @dataclass @@ -130,14 +131,16 @@ class IliasPage: return None post_url = self._abs_url_from_relative(form["action"]) + thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] + form_data: Dict[str, Union[str, List[ſtr]]] = { - "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})], + "thread_ids[]": thread_ids, "selected_cmd2": "html", "select_cmd2": "Ausführen", "selected_cmd": "", } - return IliasDownloadForumData(post_url, form_data) + return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0) def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_forum_page(): diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 1852c5f..f2d5215 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -658,7 +658,7 @@ instance's greatest bottleneck. @_iorepeat(3, "crawling forum") @anoncritical async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: - elements = [] + elements: List[IliasForumThread] = [] async with cl: next_stage_url = element.url @@ -677,6 +677,10 @@ instance's greatest bottleneck. 
download_data = page.get_download_forum_data() if not download_data: raise CrawlWarning("Failed to extract forum data") + if download_data.empty: + log.explain("Forum had no threads") + elements = [] + return html = await self._post_authenticated(download_data.url, download_data.form_data) elements = parse_ilias_forum_export(soupify(html)) From fb4631ba180a9ff0303d59e798d4bccfa0253666 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 13:13:36 +0200 Subject: [PATCH 396/524] Fix ilias background login --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 35 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index f2d5215..10a270f 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -23,6 +23,12 @@ from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, Ilia TargetType = Union[str, int] +_ILIAS_URL = "https://ilias.studium.kit.edu" + + +class KitShibbolethBackgroundLoginSuccessful(): + pass + class KitIliasWebCrawlerSection(HttpCrawlerSection): def target(self) -> TargetType: @@ -36,7 +42,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): if target == "desktop": # Full personal desktop return target - if target.startswith("https://ilias.studium.kit.edu"): + if target.startswith(_ILIAS_URL): # ILIAS URL return target @@ -181,7 +187,7 @@ instance's greatest bottleneck. 
section.tfa_auth(authenticators), ) - self._base_url = "https://ilias.studium.kit.edu" + self._base_url = _ILIAS_URL self._target = section.target() self._link_file_redirect_delay = section.link_redirect_delay() @@ -808,14 +814,17 @@ class KitShibbolethLogin: # Equivalent: Click on "Mit KIT-Account anmelden" button in # https://ilias.studium.kit.edu/login.php - url = "https://ilias.studium.kit.edu/shib_login.php" + url = f"{_ILIAS_URL}/shib_login.php" data = { "sendLogin": "1", "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", "il_target": "", "home_organization_selection": "Weiter", } - soup: BeautifulSoup = await _shib_post(sess, url, data) + soup: Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful] = await _shib_post(sess, url, data) + + if isinstance(soup, KitShibbolethBackgroundLoginSuccessful): + return # Attempt to login using credentials, if necessary while not self._login_successful(soup): @@ -854,7 +863,7 @@ class KitShibbolethLogin: # (or clicking "Continue" if you have JS disabled) relay_state = soup.find("input", {"name": "RelayState"}) saml_response = soup.find("input", {"name": "SAMLResponse"}) - url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" + url = f"{_ILIAS_URL}/Shibboleth.sso/SAML2/POST" data = { # using the info obtained in the while loop above "RelayState": relay_state["value"], "SAMLResponse": saml_response["value"], @@ -903,22 +912,35 @@ async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> Beautifu return soupify(await response.read()) -async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: +async def _shib_post( + session: aiohttp.ClientSession, + url: str, + data: Any +) -> Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful]: """ aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected by Shibboleth. Thanks a lot. 
So now we unroll the requests manually, parse location headers and build encoded URL objects ourselves... Who thought mangling location header was a good idea?? """ + log.explain_topic("Shib login POST") async with session.post(url, data=data, allow_redirects=False) as response: location = response.headers.get("location") + log.explain(f"Got location {location!r}") if not location: raise CrawlWarning(f"Login failed (1), no location header present at {url}") correct_url = yarl.URL(location, encoded=True) + log.explain(f"Corrected location to {correct_url!r}") + + if str(correct_url).startswith(_ILIAS_URL): + log.explain("ILIAS recognized our shib token and logged us in in the background, returning") + return KitShibbolethBackgroundLoginSuccessful() async with session.get(correct_url, allow_redirects=False) as response: location = response.headers.get("location") + log.explain(f"Redirected to {location!r} with status {response.status}") # If shib still still has a valid session, it will directly respond to the request if location is None: + log.explain("Shib recognized us, returning its response directly") return soupify(await response.read()) as_yarl = yarl.URL(response.url) @@ -932,6 +954,7 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea path=location, encoded=True ) + log.explain(f"Corrected location to {correct_url!r}") async with session.get(correct_url, allow_redirects=False) as response: return soupify(await response.read()) From 5fdd40204b156b15c008ec1dee05e168672fe243 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 14:33:58 +0200 Subject: [PATCH 397/524] Unwrap future meetings when ILIAS hides them behind a pagination --- PFERD/crawl/ilias/kit_ilias_html.py | 20 +++++++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 29 ++++++++++++++-------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 
9ea6b9f..2f0011e 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -146,11 +146,17 @@ class IliasPage: if self._is_forum_page(): if "trows=800" in self._page_url: return None + log.explain("Requesting *all* forum threads") return self._get_show_max_forum_entries_per_page_url() if self._is_ilias_opencast_embedding(): + log.explain("Unwrapping opencast embedding") return self.get_child_elements()[0] if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + log.explain("Unwrapping video pagination") return self._find_video_entries_paginated()[0] + if self._contains_collapsed_future_meetings(): + log.explain("Requesting *all* future meetings") + return self._uncollapse_future_meetings_url() return None def _is_forum_page(self) -> bool: @@ -203,6 +209,16 @@ class IliasPage: return False return "target=copa_" in link.get("value") + def _contains_collapsed_future_meetings(self) -> bool: + return self._uncollapse_future_meetings_url() is not None + + def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: + element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) + if not element: + return None + link = self._abs_url_from_link(element) + return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. 
The actual video stream url is nowhere @@ -793,6 +809,10 @@ class IliasPage: if img_tag is None: img_tag = found_parent.select_one("img.icon") + if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): + log.explain("Found session expansion button, skipping it as it has no content") + return None + if img_tag is None: _unexpected_html_warning() log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 10a270f..bc0d816 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -234,19 +234,28 @@ instance's greatest bottleneck. async def gather_elements() -> None: elements.clear() async with cl: - soup = await self._get_page(url) - - if expected_id is not None: - perma_link_element: Tag = soup.find(id="current_perma_link") - if not perma_link_element or "crs_" not in perma_link_element.get("value"): - raise CrawlError("Invalid course id? Didn't find anything looking like a course") + next_stage_url: Optional[str] = url + current_parent = None # Duplicated code, but the root page is special - we want to avoid fetching it twice! - log.explain_topic("Parsing root HTML page") - log.explain(f"URL: {url}") - page = IliasPage(soup, url, None) - elements.extend(page.get_child_elements()) + while next_stage_url: + soup = await self._get_page(next_stage_url) + if current_parent is None and expected_id is not None: + perma_link_element: Tag = soup.find(id="current_perma_link") + if not perma_link_element or "crs_" not in perma_link_element.get("value"): + raise CrawlError("Invalid course id? 
Didn't find anything looking like a course") + + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {next_stage_url}") + page = IliasPage(soup, next_stage_url, current_parent) + if next_element := page.get_next_stage_element(): + current_parent = next_element + next_stage_url = next_element.url + else: + next_stage_url = None + + elements.extend(page.get_child_elements()) if description_string := page.get_description(): description.append(description_string) From e1430e629844ad122a78d18197ed54100c734bbb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 18:36:34 +0200 Subject: [PATCH 398/524] Handle (and ignore) surveys --- PFERD/crawl/ilias/kit_ilias_html.py | 3 +++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 2f0011e..d969577 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -24,6 +24,7 @@ class IliasElementType(Enum): LINK = "link" BOOKING = "booking" MEETING = "meeting" + SURVEY = "survey" VIDEO = "video" VIDEO_PLAYER = "video_player" VIDEO_FOLDER = "video_folder" @@ -730,6 +731,8 @@ class IliasPage: return IliasElementType.TEST if "fold" in icon["class"]: return IliasElementType.FOLDER + if "svy" in icon["class"]: + return IliasElementType.SURVEY _unexpected_html_warning() log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index bc0d816..5ff8212 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -380,6 +380,13 @@ instance's greatest bottleneck. 
log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Tests contain no relevant files") log.explain("Answer: No") + elif element.type == IliasElementType.SURVEY: + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](surveys contain no relevant data)" + ) return None elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) From 1b6be6bd79112faea6e56c43f4756dde10ba00ba Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 18:36:54 +0200 Subject: [PATCH 399/524] Handle content pages in cards --- PFERD/crawl/ilias/kit_ilias_html.py | 2 ++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d969577..ee0364a 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -731,6 +731,8 @@ class IliasPage: return IliasElementType.TEST if "fold" in icon["class"]: return IliasElementType.FOLDER + if "copa" in icon["class"]: + return IliasElementType.FOLDER if "svy" in icon["class"]: return IliasElementType.SURVEY diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 5ff8212..9295e93 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -377,9 +377,13 @@ instance's greatest bottleneck. 
return None return await self._handle_forum(element, element_path) elif element.type == IliasElementType.TEST: - log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") - log.explain("Tests contain no relevant files") - log.explain("Answer: No") + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](tests contain no relevant data)" + ) + return None elif element.type == IliasElementType.SURVEY: log.status( "[bold bright_black]", From f47d2f11d843bfd3307815b231dd3e3df0265cef Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 Oct 2022 20:28:06 +0200 Subject: [PATCH 400/524] Append trailing slash to kit-ipd links to ensure urljoin works as expected --- CHANGELOG.md | 1 + PFERD/crawl/kit_ipd_crawler.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7a9899..24d9fa6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Fixed - Forum crawling crashing when parsing empty (= 0 messages) threads - Forum crawling crashing when a forum has no threads at all +- kit-ipd crawler if URL did not end with a trailing slash ## 3.4.1 - 2022-08-17 diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index d9fac32..338e059 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -24,6 +24,9 @@ class KitIpdCrawlerSection(HttpCrawlerSection): if not target.startswith("https://"): self.invalid_value("target", target, "Should be a URL") + if not target.endswith("/"): + target = target + "/" + return target def link_regex(self) -> Pattern[str]: From 37b51a66d87d368afc3bef2b81edf1629f95cd57 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 Oct 2022 18:22:37 +0200 Subject: [PATCH 401/524] Update changelog --- CHANGELOG.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24d9fa6..2bb0231 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,10 +22,16 @@ 
ambiguous situations. ## Unreleased +### Added +- Recognize and crawl content pages in cards +- Recognize and ignore surveys + ### Fixed -- Forum crawling crashing when parsing empty (= 0 messages) threads +- Forum crawling crashing when a thread has no messages at all - Forum crawling crashing when a forum has no threads at all -- kit-ipd crawler if URL did not end with a trailing slash +- Ilias login failing in some cases +- Crawling of paginated future meetings +- IPD crawler handling of URLs without trailing slash ## 3.4.1 - 2022-08-17 From 259cfc20cccae68a2f34984796405a35a7f31707 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 Oct 2022 18:26:17 +0200 Subject: [PATCH 402/524] Bump version to 3.4.2 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bb0231..9ecddf7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.4.2 - 2022-10-26 + ### Added - Recognize and crawl content pages in cards - Recognize and ignore surveys diff --git a/PFERD/version.py b/PFERD/version.py index 8832a51..0ef5d89 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.1" +VERSION = "3.4.2" From c020cccc64f152882688b119416f0582ec94e074 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Oct 2022 14:08:29 +0200 Subject: [PATCH 403/524] Include found paths in "second path found" warning --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 2 +- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 8 +++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ecddf7..3dd25b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Changed +- Clear up error message shown when multiple paths are found to an element + ## 3.4.2 - 2022-10-26 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index ee0364a..56dcf7b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -134,7 +134,7 @@ class IliasPage: thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] - form_data: Dict[str, Union[str, List[ſtr]]] = { + form_data: Dict[str, Union[str, List[str]]] = { "thread_ids[]": thread_ids, "selected_cmd2": "html", "select_cmd2": "Ausführen", diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 9295e93..e3719b8 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -194,7 +194,7 @@ instance's greatest bottleneck. self._links = section.links() self._videos = section.videos() self._forums = section.forums() - self._visited_urls: Set[str] = set() + self._visited_urls: Dict[str, PurePath] = dict() async def _run(self) -> None: if isinstance(self._target, int): @@ -348,9 +348,11 @@ instance's greatest bottleneck. ) -> Optional[Coroutine[Any, Any, None]]: if element.url in self._visited_urls: raise CrawlWarning( - f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" + f"Found second path to element {element.name!r} at {element.url!r}. " + + f"First path: {fmt_path(self._visited_urls[element.url])}. " + + f"Second path: {fmt_path(parent_path)}." 
) - self._visited_urls.add(element.url) + self._visited_urls[element.url] = parent_path element_path = PurePath(parent_path, element.name) From 07200bbde5fb72f2f846101b92b440724c8c7959 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 Oct 2022 14:10:45 +0100 Subject: [PATCH 404/524] Document ilias web crawler's forums option --- CHANGELOG.md | 3 +++ CONFIG.md | 1 + 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dd25b8..e5e81d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- Missing documentation for `forums` option + ### Changed - Clear up error message shown when multiple paths are found to an element diff --git a/CONFIG.md b/CONFIG.md index 0f114ed..1ca43c4 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -181,6 +181,7 @@ script once per day should be fine. redirect to the actual URL. Set to a negative value to disable the automatic redirect. (Default: `-1`) - `videos`: Whether to download videos. (Default: `no`) +- `forums`: Whether to download forum threads. (Default: `no`) - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: `20.0`) From e69b55b3496d58bc19d76429ca0078ab10f23074 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Fri, 4 Nov 2022 12:18:26 +0100 Subject: [PATCH 405/524] Add more unofficial package managers (#66) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ce917b0..31a3475 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. 
Unofficial packages are available for: - [AUR](https://aur.archlinux.org/packages/pferd) +- [brew](https://formulae.brew.sh/formula/pferd) +- [conda-forge](https://github.com/conda-forge/pferd-feedstock) - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) +- [PyPi](https://pypi.org/project/pferd) See also PFERD's [repology page](https://repology.org/project/pferd/versions). From 635caa765decd9a747d8b313252fd6b56cea0951 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 15 Nov 2022 17:17:55 +0100 Subject: [PATCH 406/524] Fix typo Thanks, burg113 --- CONFIG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONFIG.md b/CONFIG.md index 1ca43c4..640e4af 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -290,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`. Example: `foo/bar --> baz` - Doesn't match `foo`, `a/foo/bar` or `foo/baz` - Converts `foo/bar` into `baz` -- Converts `foo/bar/wargl` into `bar/wargl` +- Converts `foo/bar/wargl` into `baz/wargl` Example: `foo/bar --> !` - Doesn't match `foo`, `a/foo/bar` or `foo/baz` From c0d6d8b22975234b0c9141a22307c8036698566c Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 21 Nov 2022 17:53:30 +0100 Subject: [PATCH 407/524] Use url after redirect for relative links --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 27 ++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5e81d6..5bbefd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,9 @@ ambiguous situations. 
### Changed - Clear up error message shown when multiple paths are found to an element +### Fixed +- IPD crawler unnecessarily appending trailing slashes + ## 3.4.2 - 2022-10-26 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 338e059..c852be0 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -2,7 +2,7 @@ import os import re from dataclasses import dataclass from pathlib import PurePath -from typing import Awaitable, List, Optional, Pattern, Set, Union +from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -24,9 +24,6 @@ class KitIpdCrawlerSection(HttpCrawlerSection): if not target.startswith("https://"): self.invalid_value("target", target, "Should be a URL") - if not target.endswith("/"): - target = target + "/" - return target def link_regex(self) -> Pattern[str]: @@ -102,32 +99,32 @@ class KitIpdCrawler(HttpCrawler): await self._stream_from_url(file.url, sink, bar) async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: - page = await self.get_page() + page, url = await self.get_page() elements: List[Tag] = self._find_file_links(page) items: Set[Union[KitIpdFile, KitIpdFolder]] = set() for element in elements: folder_label = self._find_folder_label(element) if folder_label: - folder = self._extract_folder(folder_label) + folder = self._extract_folder(folder_label, url) if folder not in items: items.add(folder) folder.explain() else: - file = self._extract_file(element) + file = self._extract_file(element, url) items.add(file) log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") log.explain("Attributing it to root folder") return items - def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: + def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder: files: List[KitIpdFile] = [] name = folder_tag.getText().strip() container: Tag = 
folder_tag.findNextSibling(name="table") for link in self._find_file_links(container): - files.append(self._extract_file(link)) + files.append(self._extract_file(link, url)) return KitIpdFolder(name, files) @@ -138,16 +135,16 @@ class KitIpdCrawler(HttpCrawler): return None return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) - def _extract_file(self, link: Tag) -> KitIpdFile: - url = self._abs_url_from_link(link) + def _extract_file(self, link: Tag, url: str) -> KitIpdFile: + url = self._abs_url_from_link(url, link) name = os.path.basename(url) return KitIpdFile(name, url) def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: return tag.findAll(name="a", attrs={"href": self._file_regex}) - def _abs_url_from_link(self, link_tag: Tag) -> str: - return urljoin(self._url, link_tag.get("href")) + def _abs_url_from_link(self, url: str, link_tag: Tag) -> str: + return urljoin(url, link_tag.get("href")) async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: async with self.session.get(url, allow_redirects=False) as resp: @@ -162,7 +159,7 @@ class KitIpdCrawler(HttpCrawler): sink.done() - async def get_page(self) -> BeautifulSoup: + async def get_page(self) -> Tuple[BeautifulSoup, str]: async with self.session.get(self._url) as request: # The web page for Algorithmen für Routenplanung contains some # weird comments that beautifulsoup doesn't parse correctly. This @@ -170,4 +167,4 @@ class KitIpdCrawler(HttpCrawler): # cause issues on other pages. 
content = (await request.read()).decode("utf-8") content = re.sub(r"", "", content) - return soupify(content.encode("utf-8")) + return soupify(content.encode("utf-8")), str(request.url) From 55a2de6b88bbd2ee0cb031271e7045f53caa1702 Mon Sep 17 00:00:00 2001 From: c0derMo Date: Fri, 25 Nov 2022 10:25:22 +0000 Subject: [PATCH 408/524] Fix crawling English opencast --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bbefd4..1dc5abc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ ambiguous situations. ### Fixed - IPD crawler unnecessarily appending trailing slashes +- Crawling opencast when ILIAS is set to English ## 3.4.2 - 2022-10-26 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 56dcf7b..c0ebdc9 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -366,7 +366,7 @@ class IliasPage: """ # Video start links are marked with an "Abspielen" link video_links: List[Tag] = self._soup.findAll( - name="a", text=re.compile(r"\s*Abspielen\s*") + name="a", text=re.compile(r"\s*(Abspielen|Play)\s*") ) results: List[IliasPageElement] = [] From 6d44aac2783c69031e7686263fc0a2285912376f Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 29 Nov 2022 18:22:19 +0100 Subject: [PATCH 409/524] Bump version to 3.4.3 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dc5abc..8793d43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.4.3 - 2022-11-29 + ### Added - Missing documentation for `forums` option diff --git a/PFERD/version.py b/PFERD/version.py index 0ef5d89..7043d78 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.2" +VERSION = "3.4.3" From 722d2eb393913e770aff17da6b5b3b6603d1ee67 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 25 Nov 2022 12:49:36 +0100 Subject: [PATCH 410/524] Fix crawling of courses with preselected timeline tab --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8793d43..b1d18cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Fixed +- Crawling of courses with the timeline view as the default tab + ## 3.4.3 - 2022-11-29 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index c0ebdc9..44e44d9 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -158,6 +158,8 @@ class IliasPage: if self._contains_collapsed_future_meetings(): log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() + if not self._is_content_tab_selected(): + return self._select_content_page_url() return None def _is_forum_page(self) -> bool: @@ -220,6 +222,27 @@ class IliasPage: link = self._abs_url_from_link(element) return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + def _is_content_tab_selected(self) -> bool: + return self._select_content_page_url() is None + + def _select_content_page_url(self) -> Optional[IliasPageElement]: + tab = self._soup.find( + id="tab_view_content", + attrs={"class": lambda x: x is not None and "active" not in x} + ) + # Already selected (or not found) + if not tab: + return None + link = tab.find("a") + if link: + link = self._abs_url_from_link(link) + return 
IliasPageElement(IliasElementType.FOLDER, link, "select content page") + + _unexpected_html_warning() + log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.") + log.warn_contd("PFERD might not find content on the course's main page.") + return None + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. The actual video stream url is nowhere From 467fc526e8411d4a5113dbb78747aa119981c476 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 21 Mar 2023 23:52:24 +0100 Subject: [PATCH 411/524] Fix crawling of file/video cards --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1d18cd..c27059b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Crawling of courses with the timeline view as the default tab +- Crawling of file and custom opencast cards ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 44e44d9..079cfd6 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -738,7 +738,7 @@ class IliasPage: icon: Tag = card_root.select_one(".il-card-repository-head .icon") - if "opencast" in icon["class"]: + if "opencast" in icon["class"] or "xoct" in icon["class"]: return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED if "exc" in icon["class"]: return IliasElementType.EXERCISE @@ -758,6 +758,8 @@ class IliasPage: return IliasElementType.FOLDER if "svy" in icon["class"]: return IliasElementType.SURVEY + if "file" in icon["class"]: + return IliasElementType.FILE _unexpected_html_warning() log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") From 6f30c6583d6512c92042c581e86027a4341ddc89 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 21 
Mar 2023 23:52:33 +0100 Subject: [PATCH 412/524] Fix crawling of cards without descriptions --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c27059b..7a5f654 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Fixed - Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards +- Crawling of button cards without descriptions ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 079cfd6..efe6757 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -708,7 +708,11 @@ class IliasPage: "div", attrs={"class": lambda x: x and "caption" in x}, ) - description = caption_parent.find_next_sibling("div").getText().strip() + caption_container = caption_parent.find_next_sibling("div") + if caption_container: + description = caption_container.getText().strip() + else: + description = None if not type: _unexpected_html_warning() From 0294ceb7d5ff074dcc2566872d6b5f64f99c598f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 22 Mar 2023 00:08:19 +0100 Subject: [PATCH 413/524] Update github action versions --- .github/workflows/build-and-release.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 090ac7e..83a36e4 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -17,9 +17,9 @@ jobs: python: ["3.9"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} @@ -45,7 +45,7 @@ jobs: run: mv dist/pferd* dist/pferd-${{ matrix.os }} - name: Upload binary - uses: actions/upload-artifact@v2 + uses: 
actions/upload-artifact@v3 with: name: Binaries path: dist/pferd-${{ matrix.os }} @@ -57,7 +57,7 @@ jobs: steps: - name: Download binaries - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: Binaries From 443f7fe83913bcb82a42d7b70d4d05df65f05278 Mon Sep 17 00:00:00 2001 From: "Mr. Pine" Date: Sat, 29 Jul 2023 17:54:42 +0200 Subject: [PATCH 414/524] Add `no-delete-prompt-overwrite` crawler conflict resolution option (#75) --- CHANGELOG.md | 3 +++ CONFIG.md | 2 ++ LICENSE | 3 ++- PFERD/output_dir.py | 11 ++++++----- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a5f654..22522e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ ambiguous situations. - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +### Added +- `no-delete-prompt-override` conflict resolution strategy + ## 3.4.3 - 2022-11-29 ### Added diff --git a/CONFIG.md b/CONFIG.md index 640e4af..84ee885 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -75,6 +75,8 @@ common to all crawlers: using `prompt` and always choosing "yes". - `no-delete`: Never delete local files, but overwrite local files if the remote file is different. + - `no-delete-prompt-overwrite`: Never delete local files, but prompt to overwrite local files if the + remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). 
(Default: empty) - `tasks`: The maximum number of concurrent tasks (such as crawling or diff --git a/LICENSE b/LICENSE index fe2293f..d81e827 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,6 @@ Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, - TheChristophe, Scriptim, thelukasprobst, Toorero + TheChristophe, Scriptim, thelukasprobst, Toorero, + Mr-Pine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index c92f4a6..38d1288 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -44,6 +44,7 @@ class OnConflict(Enum): LOCAL_FIRST = "local-first" REMOTE_FIRST = "remote-first" NO_DELETE = "no-delete" + NO_DELETE_PROMPT_OVERWRITE = "no-delete-prompt-overwrite" @staticmethod def from_string(string: str) -> "OnConflict": @@ -51,7 +52,7 @@ class OnConflict(Enum): return OnConflict(string) except ValueError: raise ValueError("must be one of 'prompt', 'local-first'," - " 'remote-first', 'no-delete'") + " 'remote-first', 'no-delete', 'no-delete-prompt-overwrite'") @dataclass @@ -264,7 +265,7 @@ class OutputDirectory: on_conflict: OnConflict, path: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Replace {fmt_path(path)} with remote file?" return await prompt_yes_no(prompt, default=False) @@ -283,7 +284,7 @@ class OutputDirectory: on_conflict: OnConflict, path: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" 
return await prompt_yes_no(prompt, default=False) @@ -303,7 +304,7 @@ class OutputDirectory: path: PurePath, parent: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" return await prompt_yes_no(prompt, default=False) @@ -330,7 +331,7 @@ class OutputDirectory: return False elif on_conflict == OnConflict.REMOTE_FIRST: return True - elif on_conflict == OnConflict.NO_DELETE: + elif on_conflict in {OnConflict.NO_DELETE, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: return False # This should never be reached From d204dac8ced63534ca2b4596e9a63c880b2077a3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 2 Jun 2023 18:19:39 +0200 Subject: [PATCH 415/524] Detect unexpected root page redirects and abort operation --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 10 ++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 20 ++++++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22522e2..ee55659 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ ambiguous situations. 
- Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +- Abort crawling when encountering an unexpected ilias root page redirect ### Added - `no-delete-prompt-override` conflict resolution strategy diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index efe6757..aed2069 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -79,6 +79,16 @@ class IliasPage: self._page_type = source_element.type if source_element else None self._source_name = source_element.name if source_element else "" + @staticmethod + def is_root_page(soup: BeautifulSoup) -> bool: + permalink = soup.find(id="current_perma_link") + if permalink is None: + return False + value = permalink.attrs.get("value") + if value is None: + return False + return "goto.php?target=root_" in value + def get_child_elements(self) -> List[IliasPageElement]: """ Return all child page elements you can find here. diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index e3719b8..ae49edc 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -239,7 +239,7 @@ instance's greatest bottleneck. # Duplicated code, but the root page is special - we want to avoid fetching it twice! while next_stage_url: - soup = await self._get_page(next_stage_url) + soup = await self._get_page(next_stage_url, root_page_allowed=True) if current_parent is None and expected_id is not None: perma_link_element: Tag = soup.find(id="current_perma_link") @@ -739,12 +739,12 @@ instance's greatest bottleneck. 
sink.file.write(content.encode("utf-8")) sink.done() - async def _get_page(self, url: str) -> BeautifulSoup: + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) # We weren't authenticated, so try to do that await self.authenticate(auth_id) @@ -753,9 +753,21 @@ instance's greatest bottleneck. async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) raise CrawlError("get_page failed even after authenticating") + def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: + if IliasPage.is_root_page(soup) and not root_page_allowed: + raise CrawlError( + "Unexpectedly encountered ILIAS root page. " + "This usually happens because the ILIAS instance is broken. " + "If so, wait a day or two and try again. " + "It could also happen because a crawled element links to the ILIAS root page. " + "If so, use a transform with a ! as target to ignore the particular element. 
" + f"The redirect came from {url}" + ) + return soup + async def _post_authenticated( self, url: str, From 123a57beec37090310f76df3746e6ce107ceb299 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Jul 2023 18:14:57 +0200 Subject: [PATCH 416/524] Fix mypy unreachable error in file_templates --- PFERD/crawl/ilias/file_templates.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 151a41b..59123a2 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -102,24 +102,24 @@ class Links(Enum): INTERNET_SHORTCUT = "internet-shortcut" def template(self) -> Optional[str]: - if self == self.FANCY: + if self == Links.FANCY: return _link_template_fancy - elif self == self.PLAINTEXT: + elif self == Links.PLAINTEXT: return _link_template_plain - elif self == self.INTERNET_SHORTCUT: + elif self == Links.INTERNET_SHORTCUT: return _link_template_internet_shortcut - elif self == self.IGNORE: + elif self == Links.IGNORE: return None raise ValueError("Missing switch case") def extension(self) -> Optional[str]: - if self == self.FANCY: + if self == Links.FANCY: return ".html" - elif self == self.PLAINTEXT: + elif self == Links.PLAINTEXT: return ".txt" - elif self == self.INTERNET_SHORTCUT: + elif self == Links.INTERNET_SHORTCUT: return ".url" - elif self == self.IGNORE: + elif self == Links.IGNORE: return None raise ValueError("Missing switch case") From 68c398f1fea5cfefd86d11e79f2f6582d50e6563 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Jul 2023 23:23:10 +0200 Subject: [PATCH 417/524] Add support for ILIAS learning modules --- CHANGELOG.md | 1 + PFERD/crawl/ilias/file_templates.py | 69 +++++++++ PFERD/crawl/ilias/ilias_html_cleaner.py | 2 +- PFERD/crawl/ilias/kit_ilias_html.py | 46 ++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 160 ++++++++++++++++++++- 5 files changed, 272 insertions(+), 6 deletions(-) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index ee55659..6e3925c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ ambiguous situations. ### Added - `no-delete-prompt-override` conflict resolution strategy +- support for ILIAS learning modules ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 59123a2..b206461 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,6 +1,10 @@ from enum import Enum from typing import Optional +import bs4 + +from PFERD.utils import soupify + _link_template_plain = "{{link}}" _link_template_fancy = """ @@ -94,6 +98,71 @@ _link_template_internet_shortcut = """ URL={{link}} """.strip() +_learning_module_template = """ + + + + + {{name}} + + + + +{{body}} + + +""" + + +def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str: + # Seems to be comments, ignore those. + for elem in body.select(".il-copg-mob-fullscreen-modal"): + elem.decompose() + + nav_template = """ + + """ + if prev and body.select_one(".ilc_page_lnav_LeftNavigation"): + text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip() + left = f'{text}' + else: + left = "" + + if next and body.select_one(".ilc_page_rnav_RightNavigation"): + text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip() + right = f'{text}' + else: + right = "" + + if top_nav := body.select_one(".ilc_page_tnav_TopNavigation"): + top_nav.replace_with( + soupify(nav_template.replace("{{left}}", left).replace("{{right}}", right).encode()) + ) + + if bot_nav := body.select_one(".ilc_page_bnav_BottomNavigation"): + bot_nav.replace_with(soupify(nav_template.replace( + "{{left}}", left).replace("{{right}}", right).encode()) + ) + + body = body.prettify() + return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name) + class Links(Enum): IGNORE = "ignore" diff --git 
a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index 5952309..5495304 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -82,7 +82,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: dummy.decompose() if len(children) > 1: continue - if type(children[0]) == Comment: + if isinstance(type(children[0]), Comment): dummy.decompose() for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index aed2069..46a8073 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -22,6 +22,7 @@ class IliasElementType(Enum): FOLDER = "folder" FORUM = "forum" LINK = "link" + LEARNING_MODULE = "learning_module" BOOKING = "booking" MEETING = "meeting" SURVEY = "survey" @@ -71,6 +72,14 @@ class IliasForumThread: mtime: Optional[datetime] +@dataclass +class IliasLearningModulePage: + title: str + content: Tag + next_url: Optional[str] + previous_url: Optional[str] + + class IliasPage: def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): @@ -136,6 +145,34 @@ class IliasPage: return BeautifulSoup(raw_html, "html.parser") + def get_learning_module_data(self) -> Optional[IliasLearningModulePage]: + if not self._is_learning_module_page(): + return None + content = self._soup.select_one("#ilLMPageContent") + title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip() + return IliasLearningModulePage( + title=title, + content=content, + next_url=self._find_learning_module_next(), + previous_url=self._find_learning_module_prev() + ) + + def _find_learning_module_next(self) -> Optional[str]: + for link in self._soup.select("a.ilc_page_rnavlink_RightNavigationLink"): + url = self._abs_url_from_link(link) + if "baseClass=ilLMPresentationGUI" not in url: + continue + return url + return None + + def 
_find_learning_module_prev(self) -> Optional[str]: + for link in self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink"): + url = self._abs_url_from_link(link) + if "baseClass=ilLMPresentationGUI" not in url: + continue + return url + return None + def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) if not form: @@ -222,6 +259,12 @@ class IliasPage: return False return "target=copa_" in link.get("value") + def _is_learning_module_page(self) -> bool: + link = self._soup.find(id="current_perma_link") + if not link: + return False + return "target=pg_" in link.get("value") + def _contains_collapsed_future_meetings(self) -> bool: return self._uncollapse_future_meetings_url() is not None @@ -812,6 +855,9 @@ class IliasPage: if "cmdClass=ilobjtestgui" in parsed_url.query: return IliasElementType.TEST + if "baseClass=ilLMPresentationGUI" in parsed_url.query: + return IliasElementType.LEARNING_MODULE + # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so # try to guess it from the image. 
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ae49edc..f82d684 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,8 +1,11 @@ import asyncio +import base64 +import os import re from collections.abc import Awaitable, Coroutine from pathlib import PurePath -from typing import Any, Callable, Dict, List, Optional, Set, Union, cast +from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast +from urllib.parse import urljoin import aiohttp import yarl @@ -16,10 +19,10 @@ from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import Links +from .file_templates import Links, learning_module_template from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, - _sanitize_path_name, parse_ilias_forum_export) +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, + IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -394,6 +397,8 @@ instance's greatest bottleneck. "[bright_black](surveys contain no relevant data)" ) return None + elif element.type == IliasElementType.LEARNING_MODULE: + return await self._handle_learning_module(element, element_path) elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) elif element.type == IliasElementType.BOOKING: @@ -739,6 +744,135 @@ instance's greatest bottleneck. 
sink.file.write(content.encode("utf-8")) sink.done() + async def _handle_learning_module( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Coroutine[Any, Any, None]]: + maybe_cl = await self.crawl(element_path) + if not maybe_cl: + return None + return self._crawl_learning_module(element, maybe_cl) + + @_iorepeat(3, "crawling learning module") + @anoncritical + async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None: + elements: List[IliasLearningModulePage] = [] + + async with cl: + log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {element.url}") + soup = await self._get_page(element.url) + page = IliasPage(soup, element.url, None) + if next := page.get_learning_module_data(): + elements.extend(await self._crawl_learning_module_direction( + cl.path, next.previous_url, "left" + )) + elements.append(next) + elements.extend(await self._crawl_learning_module_direction( + cl.path, next.next_url, "right" + )) + + # Reflect their natural ordering in the file names + for index, lm_element in enumerate(elements): + lm_element.title = f"{index:02}_{lm_element.title}" + + tasks: List[Awaitable[None]] = [] + for index, elem in enumerate(elements): + prev_url = elements[index - 1].title if index > 0 else None + next_url = elements[index + 1].title if index < len(elements) - 1 else None + tasks.append(asyncio.create_task( + self._download_learning_module_page(cl.path, elem, prev_url, next_url) + )) + + # And execute them + await self.gather(tasks) + + async def _crawl_learning_module_direction( + self, + path: PurePath, + start_url: Optional[str], + dir: Union[Literal["left"], Literal["right"]] + ) -> List[IliasLearningModulePage]: + elements: List[IliasLearningModulePage] = [] + + if not start_url: + return elements + + next_element_url: Optional[str] = start_url + counter = 0 + while next_element_url: + log.explain_topic(f"Parsing HTML page for {fmt_path(path)} 
({dir}-{counter})") + log.explain(f"URL: {next_element_url}") + soup = await self._get_page(next_element_url) + page = IliasPage(soup, next_element_url, None) + if next := page.get_learning_module_data(): + elements.append(next) + if dir == "left": + next_element_url = next.previous_url + else: + next_element_url = next.next_url + counter += 1 + + return elements + + @anoncritical + @_iorepeat(3, "saving learning module page") + async def _download_learning_module_page( + self, + parent_path: PurePath, + element: IliasLearningModulePage, + prev: Optional[str], + next: Optional[str] + ) -> None: + path = parent_path / (_sanitize_path_name(element.title) + ".html") + maybe_dl = await self.download(path) + if not maybe_dl: + return + my_path = self._transformer.transform(maybe_dl.path) + if not my_path: + return + + if prev: + prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) + if prev_p: + prev = os.path.relpath(prev_p, my_path.parent) + else: + prev = None + if next: + next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) + if next_p: + next = os.path.relpath(next_p, my_path.parent) + else: + next = None + + async with maybe_dl as (bar, sink): + content = element.content + content = await self.internalize_images(content) + sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8")) + sink.done() + + async def internalize_images(self, tag: Tag) -> Tag: + """ + Tries to fetch ILIAS images and embed them as base64 data. 
+ """ + log.explain_topic("Internalizing images") + for elem in tag.find_all(recursive=True): + if not isinstance(elem, Tag): + continue + if elem.name == "img": + if src := elem.attrs.get("src", None): + url = urljoin(_ILIAS_URL, src) + if not url.startswith(_ILIAS_URL): + continue + log.explain(f"Internalizing {url!r}") + img = await self._get_authenticated(url) + elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode() + if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): + # For unknown reasons the protocol seems to be stripped. + elem.attrs["src"] = "https:" + elem.attrs["src"] + return tag + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: @@ -772,7 +906,7 @@ instance's greatest bottleneck. self, url: str, data: dict[str, Union[str, List[str]]] - ) -> BeautifulSoup: + ) -> bytes: auth_id = await self._current_auth_id() form_data = aiohttp.FormData() @@ -792,6 +926,22 @@ instance's greatest bottleneck. return await request.read() raise CrawlError("post_authenticated failed even after authenticating") + async def _get_authenticated(self, url: str) -> bytes: + auth_id = await self._current_auth_id() + + async with self.session.get(url, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) + + # Retry once after authenticating. If this fails, we will die. + async with self.session.get(url, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + raise CrawlError("get_authenticated failed even after authenticating") + # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. 
@ _iorepeat(3, "Login", failure_is_error=True) From dbc2553b119c39c7a8ad196c6858fc8109f746a9 Mon Sep 17 00:00:00 2001 From: "Mr. Pine" <50425705+Mr-Pine@users.noreply.github.com> Date: Wed, 15 Mar 2023 15:33:42 +0100 Subject: [PATCH 418/524] Add default `show-not-deleted` option If set to `no`, PFERD won't print status or report messages for not deleted files --- CHANGELOG.md | 3 +++ CONFIG.md | 8 ++++++-- PFERD/__main__.py | 4 ++++ PFERD/cli/parser.py | 7 +++++++ PFERD/config.py | 3 +++ PFERD/logging.py | 20 ++++++++++++++++++++ PFERD/output_dir.py | 2 +- PFERD/pferd.py | 2 +- 8 files changed, 45 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e3925c..85513d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,9 @@ ambiguous situations. ### Added - `no-delete-prompt-override` conflict resolution strategy - support for ILIAS learning modules +- `show_not_deleted` option to stop printing the "Not Deleted" status or report + message. This combines nicely with the `no-delete-prompt-override` strategy, + causing PFERD to mostly ignore local-only files. ## 3.4.3 - 2022-11-29 diff --git a/CONFIG.md b/CONFIG.md index 84ee885..5f62749 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -26,6 +26,9 @@ default values for the other sections. `Added ...`) while running a crawler. (Default: `yes`) - `report`: Whether PFERD should print a report of added, changed and deleted local files for all crawlers before exiting. (Default: `yes`) +- `show_not_deleted`: Whether PFERD should print messages in status and report + when a local-only file wasn't deleted. Combines nicely with the + `no-delete-prompt-override` conflict resolution strategy. - `share_cookies`: Whether crawlers should share cookies where applicable. For example, some crawlers share cookies if they crawl the same website using the same account. (Default: `yes`) @@ -75,8 +78,9 @@ common to all crawlers: using `prompt` and always choosing "yes". 
- `no-delete`: Never delete local files, but overwrite local files if the remote file is different. - - `no-delete-prompt-overwrite`: Never delete local files, but prompt to overwrite local files if the - remote file is different. + - `no-delete-prompt-overwrite`: Never delete local files, but prompt to + overwrite local files if the remote file is different. Combines nicely + with the `show_not_deleted` option. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) - `tasks`: The maximum number of concurrent tasks (such as crawling or diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 4faeb13..cb8c67c 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -47,6 +47,8 @@ def configure_logging_from_args(args: argparse.Namespace) -> None: log.output_explain = args.explain if args.status is not None: log.output_status = args.status + if args.show_not_deleted is not None: + log.output_not_deleted = args.show_not_deleted if args.report is not None: log.output_report = args.report @@ -72,6 +74,8 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N log.output_status = config.default_section.status() if args.report is None: log.output_report = config.default_section.report() + if args.show_not_deleted is None: + log.output_not_deleted = config.default_section.show_not_deleted() except ConfigOptionError as e: log.error(str(e)) sys.exit(1) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index e753023..be483fd 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -215,6 +215,11 @@ PARSER.add_argument( action=BooleanOptionalAction, help="whether crawlers should share cookies where applicable" ) +PARSER.add_argument( + "--show-not-deleted", + action=BooleanOptionalAction, + help="print messages in status and report when PFERD did not delete a local only file" +) def load_default_section( @@ -233,6 +238,8 @@ def 
load_default_section( section["report"] = "yes" if args.report else "no" if args.share_cookies is not None: section["share_cookies"] = "yes" if args.share_cookies else "no" + if args.show_not_deleted is not None: + section["show_not_deleted"] = "yes" if args.show_not_deleted else "no" SUBPARSERS = PARSER.add_subparsers(title="crawlers") diff --git a/PFERD/config.py b/PFERD/config.py index 8f7e682..b2cff4e 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -82,6 +82,9 @@ class DefaultSection(Section): def report(self) -> bool: return self.s.getboolean("report", fallback=True) + def show_not_deleted(self) -> bool: + return self.s.getboolean("show_not_deleted", fallback=True) + def share_cookies(self) -> bool: return self.s.getboolean("share_cookies", fallback=True) diff --git a/PFERD/logging.py b/PFERD/logging.py index 340b21f..b958fb2 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -59,6 +59,7 @@ class Log: # Whether different parts of the output are enabled or disabled self.output_explain = False self.output_status = True + self.output_not_deleted = True self.output_report = True def _update_live(self) -> None: @@ -207,6 +208,17 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new action = escape(f"{action:<{self.STATUS_WIDTH}}") self.print(f"{style}{action}[/] {escape(text)} {suffix}") + def not_deleted(self, style: str, action: str, text: str, suffix: str = "") -> None: + """ + Print a message for a local only file that wasn't + deleted while crawling. Allows markup in the "style" + argument which will be applied to the "action" string. + """ + + if self.output_status and self.output_not_deleted: + action = escape(f"{action:<{self.STATUS_WIDTH}}") + self.print(f"{style}{action}[/] {escape(text)} {suffix}") + def report(self, text: str) -> None: """ Print a report after crawling. Allows markup. 
@@ -215,6 +227,14 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_report: self.print(text) + def report_not_deleted(self, text: str) -> None: + """ + Print a report for a local only file that wasn't deleted after crawling. Allows markup. + """ + + if self.output_report and self.output_not_deleted: + self.print(text) + @contextmanager def _bar( self, diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 38d1288..e9e9b93 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -496,7 +496,7 @@ class OutputDirectory: except OSError: pass else: - log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure)) + log.not_deleted("[bold bright_magenta]", "Not deleted", fmt_path(pure)) self._report.not_delete_file(pure) def load_prev_report(self) -> None: diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 079053b..b30a04a 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -180,7 +180,7 @@ class Pferd: log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}") for path in sorted(crawler.report.not_deleted_files): something_changed = True - log.report(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") + log.report_not_deleted(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") for warning in crawler.report.encountered_warnings: something_changed = True From b3d412360baeed6992535e6957d0bc1e368c337f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 26 Aug 2023 23:48:14 +0200 Subject: [PATCH 419/524] Add Nix flake --- flake.lock | 27 +++++++++++++++++++++++++++ flake.nix | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..914c58b --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1692986144, + "narHash": "sha256-M4VFpy7Av9j+33HF5nIGm0k2+DXXW4qSSKdidIKg5jY=", + "owner": 
"NixOS", + "repo": "nixpkgs", + "rev": "74e5bdc5478ebbe7ba5849f0d765f92757bb9dbf", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-23.05", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..e3d52af --- /dev/null +++ b/flake.nix @@ -0,0 +1,41 @@ +{ + description = "Tool for downloading course-related files from ILIAS"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05"; + }; + + outputs = { self, nixpkgs }: + let + # Helper function to generate an attrset '{ x86_64-linux = f "x86_64-linux"; ... }'. + forAllSystems = nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed; + in + { + packages = forAllSystems (system: + let pkgs = import nixpkgs { inherit system; }; + in + rec { + default = pkgs.python3Packages.buildPythonApplication rec { + pname = "pferd"; + # Performing black magic + # Don't worry, I sacrificed enough goats for the next few years + version = (pkgs.lib.importTOML ./PFERD/version.py).VERSION; + format = "pyproject"; + + src = ./.; + + nativeBuildInputs = with pkgs.python3Packages; [ + setuptools + ]; + + propagatedBuildInputs = with pkgs.python3Packages; [ + aiohttp + beautifulsoup4 + rich + keyring + certifi + ]; + }; + }); + }; +} From 2184ac804018e836e439e365ae2b0d184adae26d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 26 Aug 2023 19:39:40 +0200 Subject: [PATCH 420/524] Add support for ILIAS mediacast listings --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 110 +++++++++++++++------ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 45 +++++---- 3 files changed, 107 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85513d2..d58ea18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ ambiguous situations. 
- `show_not_deleted` option to stop printing the "Not Deleted" status or report message. This combines nicely with the `no-delete-prompt-override` strategy, causing PFERD to mostly ignore local-only files. +- support for mediacast video listings ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 46a8073..d5ea76d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, cast from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -26,10 +26,12 @@ class IliasElementType(Enum): BOOKING = "booking" MEETING = "meeting" SURVEY = "survey" - VIDEO = "video" - VIDEO_PLAYER = "video_player" - VIDEO_FOLDER = "video_folder" - VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" + MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" + MEDIACAST_VIDEO = "mediacast_video" + OPENCAST_VIDEO = "opencast_video" + OPENCAST_VIDEO_PLAYER = "opencast_video_player" + OPENCAST_VIDEO_FOLDER = "opencast_video_folder" + OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated" @dataclass @@ -45,7 +47,8 @@ class IliasPageElement: r"eid=(?P[0-9a-z\-]+)", r"file_(?P\d+)", r"ref_id=(?P\d+)", - r"target=[a-z]+_(?P\d+)" + r"target=[a-z]+_(?P\d+)", + r"mm_(?P\d+)" ] for regex in regexes: @@ -105,9 +108,9 @@ class IliasPage: if self._is_video_player(): log.explain("Page is a video player, extracting URL") return self._player_to_video() - if self._is_video_listing(): - log.explain("Page is a video listing, searching for elements") - return self._find_video_entries() + if self._is_opencast_video_listing(): + log.explain("Page is an opencast video listing, searching for elements") + return self._find_opencast_video_entries() if 
self._is_exercise_file(): log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() @@ -199,9 +202,9 @@ class IliasPage: if self._is_ilias_opencast_embedding(): log.explain("Unwrapping opencast embedding") return self.get_child_elements()[0] - if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + if self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED: log.explain("Unwrapping video pagination") - return self._find_video_entries_paginated()[0] + return self._find_opencast_video_entries_paginated()[0] if self._contains_collapsed_future_meetings(): log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() @@ -219,7 +222,7 @@ class IliasPage: def _is_video_player(self) -> bool: return "paella_config_file" in str(self._soup) - def _is_video_listing(self) -> bool: + def _is_opencast_video_listing(self) -> bool: if self._is_ilias_opencast_embedding(): return True @@ -319,14 +322,14 @@ class IliasPage: # and just fetch the lone video url! if len(streams) == 1: video_url = streams[0]["sources"]["mp4"][0]["src"] - return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name)] log.explain(f"Found multiple videos for stream at {self._source_name}") items = [] for stream in sorted(streams, key=lambda stream: stream["content"]): full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" video_url = stream["sources"]["mp4"][0]["src"] - items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) + items.append(IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, full_name)) return items @@ -385,7 +388,7 @@ class IliasPage: return items - def _find_video_entries(self) -> List[IliasPageElement]: + def _find_opencast_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. 
The initial dummy page without any videos. This page contains the link to the listing # 2. The video listing which might be paginated @@ -405,27 +408,27 @@ class IliasPage: query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) log.explain("Found ILIAS video frame page, fetching actual content next") - return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None - if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: + if is_paginated and not self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER: # We are in stage 2 - try to break pagination - return self._find_video_entries_paginated() + return self._find_opencast_video_entries_paginated() - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() - def _find_video_entries_paginated(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_paginated(self) -> List[IliasPageElement]: table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) if table_element is None: log.warn("Couldn't increase elements per page (table not found). I might miss elements.") - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) if id_match is None: log.warn("Couldn't increase elements per page (table id not found). 
I might miss elements.") - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() table_id = id_match.group(1) @@ -434,9 +437,9 @@ class IliasPage: url = url_set_query_params(self._page_url, query_params) log.explain("Disabled pagination, retrying folder as a new entry") - return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] - def _find_video_entries_no_paging(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]: """ Crawls the "second stage" video page. This page contains the actual video urls. """ @@ -448,11 +451,11 @@ class IliasPage: results: List[IliasPageElement] = [] for link in video_links: - results.append(self._listed_video_to_element(link)) + results.append(self._listed_opencast_video_to_element(link)) return results - def _listed_video_to_element(self, link: Tag) -> IliasPageElement: + def _listed_opencast_video_to_element(self, link: Tag) -> IliasPageElement: # The link is part of a table with multiple columns, describing metadata. # 6th or 7th child (1 indexed) is the modification time string. 
Try to find it # by parsing backwards from the end and finding something that looks like a date @@ -479,7 +482,9 @@ class IliasPage: video_url = self._abs_url_from_link(link) log.explain(f"Found video {video_name!r} at {video_url}") - return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) + return IliasPageElement( + IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time + ) def _find_exercise_entries(self) -> List[IliasPageElement]: if self._soup.find(id="tab_submission"): @@ -622,9 +627,48 @@ class IliasPage: result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) result += self._find_cards() + result += self._find_mediacast_videos() return result + def _find_mediacast_videos(self) -> List[IliasPageElement]: + videos: List[IliasPageElement] = [] + + for elem in cast(List[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): + element_name = _sanitize_path_name( + elem.select_one(".ilPlayerPreviewDescription").getText().strip() + ) + if not element_name.endswith(".mp4"): + # just to make sure it has some kinda-alrightish ending + element_name = element_name + ".mp4" + video_element = elem.find(name="video") + if not video_element: + _unexpected_html_warning() + log.warn_contd(f"No