From 1fc8e9eb7ad99ad8c950c76398aab64b05c7d801 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 1 Jun 2021 10:00:59 +0000 Subject: [PATCH 001/224] Document credential file authenticator config options --- CONFIG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 7826b04..feeade3 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -182,8 +182,11 @@ via the terminal. ### The `credential-file` authenticator -This authenticator reads a username and a password from a credential file. The -credential file has exactly two lines (trailing newline optional). The first +This authenticator reads a username and a password from a credential file. + +- `path`: Path to the credential file. (Required) + +The credential file has exactly two lines (trailing newline optional). The first line starts with `username=` and contains the username, the second line starts with `password=` and contains the password. The username and password may contain any characters except a line break. 
From 31b6311e993439b2bbb087511ca012e140003d9e Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 1 Jun 2021 19:02:55 +0200 Subject: [PATCH 002/224] Remove incorrect tmp file explain message --- PFERD/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 5ae62bb..b274b6b 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -147,7 +147,6 @@ def main() -> None: log.unlock() log.explain_topic("Interrupted, exiting immediately") log.explain("Open files and connections are left for the OS to clean up") - log.explain("Temporary files are not cleaned up") pferd.print_report() # TODO Clean up tmp files # And when those files *do* actually get cleaned up properly, From fc31100a0f6e1933cf084e46898ad20d33d892b9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 4 Jun 2021 18:02:45 +0200 Subject: [PATCH 003/224] Always use '/' as path separator for regex rules Previously, regex-matching paths on windows would, in some cases, require four backslashes ('\\\\') to escape a single path separator. That's just too much. With this commit, regex transforms now use '/' instead of '\' as path separator, meaning rules can more easily be shared between platforms (although they are not guaranteed to be 100% compatible since on Windows, '\' is still recognized as a path separator). To make rules more intuitive to write, local relative paths are now also printed with '/' as path separator on Windows. Since Windows also accepts '/' as path separator, this change doesn't really affect other rules that parse their sides as paths. --- CHANGELOG.md | 3 +++ PFERD/transformer.py | 4 ++-- PFERD/utils.py | 8 +++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 87c1d05..980f96e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Changed +- Use `/` instead of `\` as path separator for (regex) rules on Windows + ## 3.0.1 - 2021-06-01 ### Added diff --git a/PFERD/transformer.py b/PFERD/transformer.py index 83ffde4..ed123eb 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -10,7 +10,7 @@ from pathlib import PurePath from typing import Dict, Optional, Sequence, Union from .logging import log -from .utils import fmt_path +from .utils import fmt_path, str_path class Rule(ABC): @@ -116,7 +116,7 @@ class ReRule(Rule): self._right = right def transform(self, path: PurePath) -> Union[PurePath, bool]: - if match := re.fullmatch(self._left, str(path)): + if match := re.fullmatch(self._left, str_path(path)): if isinstance(self._right, bool): return self._right or path diff --git a/PFERD/utils.py b/PFERD/utils.py index 397feda..7c7b6f4 100644 --- a/PFERD/utils.py +++ b/PFERD/utils.py @@ -91,8 +91,14 @@ def url_set_query_params(url: str, params: Dict[str, str]) -> str: return result +def str_path(path: PurePath) -> str: + if not path.parts: + return "." + return "/".join(path.parts) + + def fmt_path(path: PurePath) -> str: - return repr(str(path)) + return repr(str_path(path)) def fmt_real_path(path: Path) -> str: From df3ad3d890e0c7e21fbb68305f3c1016f58c2523 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 4 Jun 2021 18:33:02 +0200 Subject: [PATCH 004/224] Add 'skip' option to crawlers --- CHANGELOG.md | 3 +++ CONFIG.md | 3 +++ PFERD/auth/authenticator.py | 6 +++++- PFERD/crawl/__init__.py | 2 +- PFERD/crawl/crawler.py | 9 +++++++++ PFERD/pferd.py | 39 +++++++++++++++++++++++++------------ 6 files changed, 48 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 980f96e..32cbe77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Added +- `skip` option for crawlers + ### Changed - Use `/` instead of `\` as path separator for (regex) rules on Windows diff --git a/CONFIG.md b/CONFIG.md index feeade3..2f18be1 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -49,6 +49,9 @@ see the type's [documentation](#crawler-types) below. The following options are common to all crawlers: - `type`: The available types are specified in [this section](#crawler-types). +- `skip`: Whether the crawler should be skipped during normal execution. The + crawler can still be executed manually using the `--crawler` or `-C` flags. + (Default: `no`) - `output_dir`: The directory the crawler synchronizes files to. A crawler will never place any files outside of this directory. (Default: the crawler's name) - `redownload`: When to download a file that is already present locally. diff --git a/PFERD/auth/authenticator.py b/PFERD/auth/authenticator.py index f588bc4..643a2d5 100644 --- a/PFERD/auth/authenticator.py +++ b/PFERD/auth/authenticator.py @@ -13,7 +13,11 @@ class AuthError(Exception): class AuthSection(Section): - pass + def type(self) -> str: + value = self.s.get("type") + if value is None: + self.missing_value("type") + return value class Authenticator(ABC): diff --git a/PFERD/crawl/__init__.py b/PFERD/crawl/__init__.py index 297c490..7eb2fb1 100644 --- a/PFERD/crawl/__init__.py +++ b/PFERD/crawl/__init__.py @@ -3,7 +3,7 @@ from typing import Callable, Dict from ..auth import Authenticator from ..config import Config -from .crawler import Crawler, CrawlError # noqa: F401 +from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection from .local_crawler import LocalCrawler, LocalCrawlerSection diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index e990f16..d61783f 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -132,6 +132,15 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, 
FileSink]]): class CrawlerSection(Section): + def type(self) -> str: + value = self.s.get("type") + if value is None: + self.missing_value("type") + return value + + def skip(self) -> bool: + return self.s.getboolean("skip", fallback=False) + def output_dir(self, name: str) -> Path: # TODO Use removeprefix() after switching to 3.9 if name.startswith("crawl:"): diff --git a/PFERD/pferd.py b/PFERD/pferd.py index ac373cf..d98b426 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -3,9 +3,9 @@ from typing import Dict, List, Optional from rich.markup import escape -from .auth import AUTHENTICATORS, Authenticator, AuthError +from .auth import AUTHENTICATORS, Authenticator, AuthError, AuthSection from .config import Config, ConfigOptionError -from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler +from .crawl import CRAWLERS, Crawler, CrawlError, CrawlerSection, KitIliasWebCrawler from .logging import log from .utils import fmt_path @@ -26,19 +26,22 @@ class Pferd: self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} - def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]: - log.explain_topic("Deciding which crawlers to run") - crawl_sections = [name for name, _ in config.crawl_sections()] + def _find_config_crawlers(self, config: Config) -> List[str]: + crawl_sections = [] - if cli_crawlers is None: - log.explain("No crawlers specified on CLI") - log.explain("Running all crawlers specified in config") - return crawl_sections + for name, section in config.crawl_sections(): + if CrawlerSection(section).skip(): + log.explain(f"Skipping {name!r}") + else: + crawl_sections.append(name) + return crawl_sections + + def _find_cli_crawlers(self, config: Config, cli_crawlers: List[str]) -> List[str]: if len(cli_crawlers) != len(set(cli_crawlers)): raise PferdLoadError("Some crawlers were selected multiple times") - log.explain("Crawlers specified on CLI") + crawl_sections = [name for 
name, _ in config.crawl_sections()] crawlers_to_run = [] # With crawl: prefix unknown_names = [] # Without crawl: prefix @@ -62,10 +65,22 @@ class Pferd: return crawlers_to_run + def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]: + log.explain_topic("Deciding which crawlers to run") + + if cli_crawlers is None: + log.explain("No crawlers specified on CLI") + log.explain("Running crawlers specified in config") + return self._find_config_crawlers(config) + else: + log.explain("Crawlers specified on CLI") + return self._find_cli_crawlers(config, cli_crawlers) + def _load_authenticators(self) -> None: for name, section in self._config.auth_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") - auth_type = section.get("type") + + auth_type = AuthSection(section).type() authenticator_constructor = AUTHENTICATORS.get(auth_type) if authenticator_constructor is None: raise ConfigOptionError(name, "type", f"Unknown authenticator type: {auth_type!r}") @@ -80,7 +95,7 @@ class Pferd: for name, section in self._config.crawl_sections(): log.print(f"[bold bright_cyan]Loading[/] {escape(name)}") - crawl_type = section.get("type") + crawl_type = CrawlerSection(section).type() crawler_constructor = CRAWLERS.get(crawl_type) if crawler_constructor is None: raise ConfigOptionError(name, "type", f"Unknown crawler type: {crawl_type!r}") From 8ab462fb87e8bdfac8bfd6821645dd9f4617e898 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 4 Jun 2021 19:23:33 +0200 Subject: [PATCH 005/224] Use the exercise label instead of the button name as path --- CHANGELOG.md | 2 ++ PFERD/crawl/ilias/kit_ilias_html.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32cbe77..171a61c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ ambiguous situations. 
### Changed - Use `/` instead of `\` as path separator for (regex) rules on Windows +- Use the label to the left for exercises instead of the button name to + determine the folder name ## 3.0.1 - 2021-06-01 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 64491f9..db9a303 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -293,7 +293,13 @@ class IliasPage: # Add each listing as a new for listing in file_listings: - file_name = _sanitize_path_name(listing.getText().strip()) + parent_container: Tag = listing.findParent( + "div", attrs={"class": lambda x: x and "form-group" in x} + ) + label_container: Tag = parent_container.find( + attrs={"class": lambda x: x and "control-label" in x} + ) + file_name = _sanitize_path_name(label_container.getText().strip()) url = self._abs_url_from_link(listing) log.explain(f"Found exercise detail {file_name!r} at {url}") results.append(IliasPageElement( From 61d902d7153f2942e24f92bd9e0a35e39be05563 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 9 Jun 2021 17:42:38 +0200 Subject: [PATCH 006/224] Overhaul transform logic -re-> arrows now rename their parent directories (like -->) and don't require a full match (like -exact->). Their old behaviour is available as -exact-re->. Also, this change adds the ">>" arrow head, which modifies the current path and continues to the next rule when it matches. --- CHANGELOG.md | 3 + PFERD/transformer.py | 540 +++++++++++++++++++++++-------------------- 2 files changed, 298 insertions(+), 245 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 171a61c..ffc6e81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,8 +24,11 @@ ambiguous situations. 
### Added - `skip` option for crawlers +- Rules with `>>` instead of `>` as arrow head +- `-exact-re->` arrow (behaves like `-re->` did previously) ### Changed +- The `-re->` arrow can now rename directories (like `-->`) - Use `/` instead of `\` as path separator for (regex) rules on Windows - Use the label to the left for exercises instead of the button name to determine the folder name diff --git a/PFERD/transformer.py b/PFERD/transformer.py index ed123eb..bf51d6a 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -1,151 +1,159 @@ -# I'm sorry that this code has become a bit dense and unreadable. While -# reading, it is important to remember what True and False mean. I'd love to -# have some proper sum-types for the inputs and outputs, they'd make this code -# a lot easier to understand. - import ast import re from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum from pathlib import PurePath -from typing import Dict, Optional, Sequence, Union +from typing import Callable, Dict, List, Optional, Sequence, TypeVar, Union from .logging import log from .utils import fmt_path, str_path -class Rule(ABC): - @abstractmethod - def transform(self, path: PurePath) -> Union[PurePath, bool]: - """ - Try to apply this rule to the path. Returns another path if the rule - was successfully applied, True if the rule matched but resulted in an - exclamation mark, and False if the rule didn't match at all. 
- """ +class ArrowHead(Enum): + NORMAL = 0 + SEQUENCE = 1 + +class Ignore: + pass + + +class Empty: + pass + + +RightSide = Union[str, Ignore, Empty] + + +@dataclass +class Transformed: + path: PurePath + + +class Ignored: + pass + + +TransformResult = Optional[Union[Transformed, Ignored]] + + +@dataclass +class Rule: + left: str + name: str + head: ArrowHead + right: RightSide + + def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]: + if isinstance(self.right, str): + return self.right + elif isinstance(self.right, Ignore): + return Ignored() + elif isinstance(self.right, Empty): + return Transformed(path) + else: + raise RuntimeError(f"Right side has invalid type {type(self.right)}") + + +class Transformation(ABC): + def __init__(self, rule: Rule): + self.rule = rule + + @abstractmethod + def transform(self, path: PurePath) -> TransformResult: pass -# These rules all use a Union[T, bool] for their right side. They are passed a -# T if the arrow's right side was a normal string, True if it was an -# exclamation mark and False if it was missing entirely. 
- -class NormalRule(Rule): - def __init__(self, left: PurePath, right: Union[PurePath, bool]): - - self._left = left - self._right = right - - def _match_prefix(self, path: PurePath) -> Optional[PurePath]: - left_parts = list(reversed(self._left.parts)) - path_parts = list(reversed(path.parts)) - - if len(left_parts) > len(path_parts): +class ExactTf(Transformation): + def transform(self, path: PurePath) -> TransformResult: + if path != PurePath(self.rule.left): return None - while left_parts and path_parts: - left_part = left_parts.pop() - path_part = path_parts.pop() + right = self.rule.right_result(path) + if not isinstance(right, str): + return right - if left_part != path_part: - return None + return Transformed(PurePath(right)) - if left_parts: + +class ExactReTf(Transformation): + def transform(self, path: PurePath) -> TransformResult: + match = re.fullmatch(self.rule.left, str_path(path)) + if not match: return None - path_parts.reverse() - return PurePath(*path_parts) + right = self.rule.right_result(path) + if not isinstance(right, str): + return right - def transform(self, path: PurePath) -> Union[PurePath, bool]: - if rest := self._match_prefix(path): - if isinstance(self._right, bool): - return self._right or path + # For some reason, mypy thinks that "groups" has type List[str]. But + # since elements of "match.groups()" can be None, mypy is wrong. 
+ groups: Sequence[Optional[str]] = [match[0]] + list(match.groups()) + + locals_dir: Dict[str, Union[str, int, float]] = {} + for i, group in enumerate(groups): + if group is None: + continue + + locals_dir[f"g{i}"] = group + + try: + locals_dir[f"i{i}"] = int(group) + except ValueError: + pass + + try: + locals_dir[f"f{i}"] = float(group) + except ValueError: + pass + + result = eval(f"f{right!r}", {}, locals_dir) + return Transformed(PurePath(result)) + + +class RenamingParentsTf(Transformation): + def __init__(self, sub_tf: Transformation): + super().__init__(sub_tf.rule) + self.sub_tf = sub_tf + + def transform(self, path: PurePath) -> TransformResult: + for i in range(len(path.parts), -1, -1): + parent = PurePath(*path.parts[:i]) + child = PurePath(*path.parts[i:]) + + transformed = self.sub_tf.transform(parent) + if not transformed: + continue + elif isinstance(transformed, Transformed): + return Transformed(transformed.path / child) + elif isinstance(transformed, Ignored): + return transformed else: - return self._right / rest + raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}") - return False + return None -class ExactRule(Rule): - def __init__(self, left: PurePath, right: Union[PurePath, bool]): - self._left = left - self._right = right +class RenamingPartsTf(Transformation): + def __init__(self, sub_tf: Transformation): + super().__init__(sub_tf.rule) + self.sub_tf = sub_tf - def transform(self, path: PurePath) -> Union[PurePath, bool]: - if path == self._left: - if isinstance(self._right, bool): - return self._right or path - else: - return self._right - - return False - - -class NameRule(Rule): - def __init__(self, subrule: Rule): - self._subrule = subrule - - def transform(self, path: PurePath) -> Union[PurePath, bool]: - matched = False + def transform(self, path: PurePath) -> TransformResult: result = PurePath() - for part in path.parts: - part_result = self._subrule.transform(PurePath(part)) - if 
isinstance(part_result, PurePath): - matched = True - result /= part_result - elif part_result: - # If any subrule call ignores its path segment, the entire path - # should be ignored - return True - else: - # The subrule doesn't modify this segment, but maybe other - # segments + transformed = self.sub_tf.transform(PurePath(part)) + if not transformed: result /= part + elif isinstance(transformed, Transformed): + result /= transformed.path + elif isinstance(transformed, Ignored): + return transformed + else: + raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}") - if matched: - return result - else: - # The subrule has modified no segments, so this name version of it - # doesn't match - return False - - -class ReRule(Rule): - def __init__(self, left: str, right: Union[str, bool]): - self._left = left - self._right = right - - def transform(self, path: PurePath) -> Union[PurePath, bool]: - if match := re.fullmatch(self._left, str_path(path)): - if isinstance(self._right, bool): - return self._right or path - - vars: Dict[str, Union[str, int, float]] = {} - - # For some reason, mypy thinks that "groups" has type List[str]. - # But since elements of "match.groups()" can be None, mypy is - # wrong. 
- groups: Sequence[Optional[str]] = [match[0]] + list(match.groups()) - for i, group in enumerate(groups): - if group is None: - continue - - vars[f"g{i}"] = group - - try: - vars[f"i{i}"] = int(group) - except ValueError: - pass - - try: - vars[f"f{i}"] = float(group) - except ValueError: - pass - - result = eval(f"f{self._right!r}", vars) - return PurePath(result) - - return False + return None class RuleParseError(Exception): @@ -162,18 +170,15 @@ class RuleParseError(Exception): log.error_contd(f"{spaces}^--- {self.reason}") +T = TypeVar("T") + + class Line: def __init__(self, line: str, line_nr: int): self._line = line self._line_nr = line_nr self._index = 0 - def get(self) -> Optional[str]: - if self._index < len(self._line): - return self._line[self._index] - - return None - @property def line(self) -> str: return self._line @@ -190,155 +195,192 @@ class Line: def index(self, index: int) -> None: self._index = index - def advance(self) -> None: - self._index += 1 + @property + def rest(self) -> str: + return self.line[self.index:] - def expect(self, string: str) -> None: - for char in string: - if self.get() == char: - self.advance() - else: - raise RuleParseError(self, f"Expected {char!r}") + def peek(self, amount: int = 1) -> str: + return self.rest[:amount] + + def take(self, amount: int = 1) -> str: + string = self.peek(amount) + self.index += len(string) + return string + + def expect(self, string: str) -> str: + if self.peek(len(string)) == string: + return self.take(len(string)) + else: + raise RuleParseError(self, f"Expected {string!r}") + + def expect_with(self, string: str, value: T) -> T: + self.expect(string) + return value + + def one_of(self, parsers: List[Callable[[], T]], description: str) -> T: + for parser in parsers: + index = self.index + try: + return parser() + except RuleParseError: + self.index = index + + raise RuleParseError(self, description) + + +# RULE = LEFT SPACE '-' NAME '-' HEAD (SPACE RIGHT)? 
+# SPACE = ' '+ +# NAME = '' | 'exact' | 'name' | 're' | 'exact-re' | 'name-re' +# HEAD = '>' | '>>' +# LEFT = STR | QUOTED_STR +# RIGHT = STR | QUOTED_STR | '!' + + +def parse_zero_or_more_spaces(line: Line) -> None: + while line.peek() == " ": + line.take() + + +def parse_one_or_more_spaces(line: Line) -> None: + line.expect(" ") + parse_zero_or_more_spaces(line) + + +def parse_str(line: Line) -> str: + result = [] + while c := line.peek(): + if c == " ": + break + else: + line.take() + result.append(c) + + if result: + return "".join(result) + else: + raise RuleParseError(line, "Expected non-space character") QUOTATION_MARKS = {'"', "'"} -def parse_string_literal(line: Line) -> str: +def parse_quoted_str(line: Line) -> str: escaped = False # Points to first character of string literal start_index = line.index - quotation_mark = line.get() + quotation_mark = line.peek() if quotation_mark not in QUOTATION_MARKS: - # This should never happen as long as this function is only called from - # parse_string. 
- raise RuleParseError(line, "Invalid quotation mark") - line.advance() + raise RuleParseError(line, "Expected quotation mark") + line.take() - while c := line.get(): + while c := line.peek(): if escaped: escaped = False - line.advance() + line.take() elif c == quotation_mark: - line.advance() + line.take() stop_index = line.index literal = line.line[start_index:stop_index] - return ast.literal_eval(literal) + try: + return ast.literal_eval(literal) + except SyntaxError as e: + line.index = start_index + raise RuleParseError(line, str(e)) from e elif c == "\\": escaped = True - line.advance() + line.take() else: - line.advance() + line.take() raise RuleParseError(line, "Expected end of string literal") -def parse_until_space_or_eol(line: Line) -> str: - result = [] - while c := line.get(): - if c == " ": - break - result.append(c) - line.advance() - - return "".join(result) - - -def parse_string(line: Line) -> Union[str, bool]: - if line.get() in QUOTATION_MARKS: - return parse_string_literal(line) +def parse_left(line: Line) -> str: + if line.peek() in QUOTATION_MARKS: + return parse_quoted_str(line) else: - string = parse_until_space_or_eol(line) + return parse_str(line) + + +def parse_right(line: Line) -> Union[str, Ignore]: + c = line.peek() + if c in QUOTATION_MARKS: + return parse_quoted_str(line) + else: + string = parse_str(line) if string == "!": - return True + return Ignore() return string -def parse_arrow(line: Line) -> str: - line.expect("-") - - name = [] - while True: - c = line.get() - if not c: - raise RuleParseError(line, "Expected rest of arrow") - elif c == "-": - line.advance() - c = line.get() - if not c: - raise RuleParseError(line, "Expected rest of arrow") - elif c == ">": - line.advance() - break # End of arrow - else: - name.append("-") - continue - else: - name.append(c) - - line.advance() - - return "".join(name) +def parse_arrow_name(line: Line) -> str: + return line.one_of([ + lambda: line.expect("exact-re"), + lambda: 
line.expect("exact"), + lambda: line.expect("name-re"), + lambda: line.expect("name"), + lambda: line.expect("re"), + lambda: line.expect(""), + ], "Expected arrow name") -def parse_whitespace(line: Line) -> None: - line.expect(" ") - while line.get() == " ": - line.advance() +def parse_arrow_head(line: Line) -> ArrowHead: + return line.one_of([ + lambda: line.expect_with(">>", ArrowHead.SEQUENCE), + lambda: line.expect_with(">", ArrowHead.NORMAL), + ], "Expected arrow head") def parse_eol(line: Line) -> None: - if line.get() is not None: + if line.peek(): raise RuleParseError(line, "Expected end of line") def parse_rule(line: Line) -> Rule: - # Parse left side - leftindex = line.index - left = parse_string(line) - if isinstance(left, bool): - line.index = leftindex - raise RuleParseError(line, "Left side can't be '!'") - leftpath = PurePath(left) + parse_zero_or_more_spaces(line) + left = parse_left(line) - # Parse arrow - parse_whitespace(line) - arrowindex = line.index - arrowname = parse_arrow(line) + parse_one_or_more_spaces(line) - # Parse right side - if line.get(): - parse_whitespace(line) - right = parse_string(line) + line.expect("-") + name = parse_arrow_name(line) + line.expect("-") + head = parse_arrow_head(line) + + index = line.index + right: RightSide + try: + parse_zero_or_more_spaces(line) + parse_eol(line) + right = Empty() + except RuleParseError: + line.index = index + parse_one_or_more_spaces(line) + right = parse_right(line) + parse_eol(line) + + return Rule(left, name, head, right) + + +def parse_transformation(line: Line) -> Transformation: + rule = parse_rule(line) + + if rule.name == "": + return RenamingParentsTf(ExactTf(rule)) + elif rule.name == "exact": + return ExactTf(rule) + elif rule.name == "name": + return RenamingPartsTf(ExactTf(rule)) + elif rule.name == "re": + return RenamingParentsTf(ExactReTf(rule)) + elif rule.name == "exact-re": + return ExactReTf(rule) + elif rule.name == "name-re": + return 
RenamingPartsTf(ExactReTf(rule)) else: - right = False - rightpath: Union[PurePath, bool] - if isinstance(right, bool): - rightpath = right - else: - rightpath = PurePath(right) - - parse_eol(line) - - # Dispatch - if arrowname == "": - return NormalRule(leftpath, rightpath) - elif arrowname == "name": - if len(leftpath.parts) > 1: - line.index = leftindex - raise RuleParseError(line, "SOURCE must be a single name, not multiple segments") - return NameRule(ExactRule(leftpath, rightpath)) - elif arrowname == "exact": - return ExactRule(leftpath, rightpath) - elif arrowname == "re": - return ReRule(left, right) - elif arrowname == "name-re": - return NameRule(ReRule(left, right)) - else: - line.index = arrowindex + 1 # For nicer error message - raise RuleParseError(line, f"Invalid arrow name {arrowname!r}") + raise RuntimeError(f"Invalid arrow name {rule.name!r}") class Transformer: @@ -347,32 +389,40 @@ class Transformer: May throw a RuleParseException. """ - self._rules = [] + self._tfs = [] for i, line in enumerate(rules.split("\n")): line = line.strip() if line: - rule = parse_rule(Line(line, i)) - self._rules.append((line, rule)) + tf = parse_transformation(Line(line, i)) + self._tfs.append((line, tf)) def transform(self, path: PurePath) -> Optional[PurePath]: - for i, (line, rule) in enumerate(self._rules): + for i, (line, tf) in enumerate(self._tfs): log.explain(f"Testing rule {i+1}: {line}") try: - result = rule.transform(path) + result = tf.transform(path) except Exception as e: log.warn(f"Error while testing rule {i+1}: {line}") log.warn_contd(str(e)) continue - if isinstance(result, PurePath): - log.explain(f"Match found, transformed path to {fmt_path(result)}") - return result - elif result: # Exclamation mark - log.explain("Match found, path ignored") - return None - else: + if not result: continue - log.explain("No rule matched, path is unchanged") + if isinstance(result, Ignored): + log.explain("Match found, path ignored") + return None + + if 
tf.rule.head == ArrowHead.NORMAL: + log.explain(f"Match found, transformed path to {fmt_path(result.path)}") + path = result.path + break + elif tf.rule.head == ArrowHead.SEQUENCE: + log.explain(f"Match found, updated path to {fmt_path(result.path)}") + path = result.path + else: + raise RuntimeError(f"Invalid transform result of type {type(result)}: {result}") + + log.explain(f"Final result: {fmt_path(path)}") return path From f28bbe6b0c11c165ad604b6ab33730a37800604a Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 9 Jun 2021 22:22:40 +0200 Subject: [PATCH 007/224] Update transform rule documentation It's still missing an example that uses rules with ">>" arrows. --- CONFIG.md | 128 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 40 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 2f18be1..1793ddc 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -222,56 +222,87 @@ This authenticator does not support usernames. Transformation rules are rules for renaming and excluding files and directories. They are specified line-by-line in a crawler's `transform` option. When a crawler needs to apply a rule to a path, it goes through this list top-to-bottom -and choose the first matching rule. +and applies the first matching rule. To see this process in action, you can use the `--debug-transforms` or flag or the `--explain` flag. -Each line has the format `SOURCE ARROW TARGET` where `TARGET` is optional. -`SOURCE` is either a normal path without spaces (e. g. `foo/bar`), or a string -literal delimited by `"` or `'` (e. g. `"foo\" bar/baz"`). Python's string -escape syntax is supported. Trailing slashes are ignored. `TARGET` can be -formatted like `SOURCE`, but it can also be a single exclamation mark without -quotes (`!`). `ARROW` is one of `-->`, `-name->`, `-exact->`, `-re->` and -`-name-re->` +Each rule has the format `SOURCE ARROW TARGET` (e. g. `foo/bar --> foo/baz`). +The arrow specifies how the source and target are interpreted. 
The different +kinds of arrows are documented below. -If a rule's target is `!`, this means that when the rule matches on a path, the -corresponding file or directory is ignored. If a rule's target is missing, the -path is matched but not modified. +`SOURCE` and `TARGET` are either a bunch of characters without spaces (e. g. +`foo/bar`) or string literals (e. g, `"foo/b a r"`). The former syntax has no +concept of escaping characters, so the backslash is just another character. The +string literals however support Python's escape syntax (e. g. +`"foo\\bar\tbaz"`). This also means that in string literals, backslashes must be +escaped. + +`TARGET` can additionally be a single exclamation mark `!` (*not* `"!"`). When a +rule with a `!` as target matches a path, the corresponding file or directory is +ignored by the crawler instead of renamed. + +`TARGET` can also be omitted entirely. When a rule without target matches a +path, the path is returned unmodified. This is useful to prevent rules further +down from matching instead. + +Each arrow's behaviour can be modified slightly by changing the arrow's head +from `>` to `>>`. When a rule with a `>>` arrow head matches a path, it doesn't +return immediately like a normal arrow. Instead, it replaces the current path +with its output and continues on to the next rule. In effect, this means that +multiple rules can be applied sequentially. ### The `-->` arrow -The `-->` arrow is a basic renaming operation. If a path begins with `SOURCE`, -that part of the path is replaced with `TARGET`. This means that the rule -`foo/bar --> baz` would convert `foo/bar` into `baz`, but also `foo/bar/xyz` -into `baz/xyz`. The rule `foo --> !` would ignore a directory named `foo` as -well as all its contents. +The `-->` arrow is a basic renaming operation for files and directories. If a +path matches `SOURCE`, it is renamed to `TARGET`. 
+ +Example: `foo/bar --> baz` +- Doesn't match `foo`, `a/foo/bar` or `foo/baz` +- Converts `foo/bar` into `baz` +- Converts `foo/bar/wargl` into `bar/wargl` + +Example: `foo/bar --> !` +- Doesn't match `foo`, `a/foo/bar` or `foo/baz` +- Ignores `foo/bar` and any of its children ### The `-name->` arrow The `-name->` arrow lets you rename files and directories by their name, regardless of where they appear in the file tree. Because of this, its `SOURCE` must not contain multiple path segments, only a single name. This restriction -does not apply to its `TARGET`. The `-name->` arrow is not applied recursively -to its own output to prevent infinite loops. +does not apply to its `TARGET`. -For example, the rule `foo -name-> bar/baz` would convert `a/foo` into -`a/bar/baz` and `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz`. The rule `foo --name-> !` would ignore all directories and files named `foo`. +Example: `foo -name-> bar/baz` +- Doesn't match `a/foobar/b` or `x/Foo/y/z` +- Converts `hello/foo` into `hello/bar/baz` +- Converts `foo/world` into `bar/baz/world` +- Converts `a/foo/b/c/foo` into `a/bar/baz/b/c/bar/baz` + +Example: `foo -name-> !` +- Doesn't match `a/foobar/b` or `x/Foo/y/z` +- Ignores any path containing a segment `foo` ### The `-exact->` arrow -The `-exact->` arrow requires the path to match `SOURCE` exactly. This means -that the rule `foo/bar -exact-> baz` would still convert `foo/bar` into `baz`, -but `foo/bar/xyz` would be unaffected. Also, `foo -exact-> !` would only ignore -`foo`, but not its contents (if it has any). The examples below show why this is -useful. +The `-exact->` arrow requires the path to match `SOURCE` exactly. The examples +below show why this is useful. 
+ +Example: `foo/bar -exact-> baz` +- Doesn't match `foo`, `a/foo/bar` or `foo/baz` +- Converts `foo/bar` into `baz` +- Doesn't match `foo/bar/wargl` + +Example: `foo/bar -exact-> !` +- Doesn't match `foo`, `a/foo/bar` or `foo/baz` +- Ignores only `foo/bar`, not its children ### The `-re->` arrow -The `-re->` arrow uses regular expressions. `SOURCE` is a regular expression -that must match the entire path. If this is the case, then the capturing groups -are available in `TARGET` for formatting. +The `-re->` arrow is like the `-->` arrow but with regular expressions. `SOURCE` +is a regular expression and `TARGET` an f-string based template. If a path +matches `SOURCE`, the output path is created using `TARGET` as template. +`SOURCE` is automatically anchored. `TARGET` uses Python's [format string syntax][3]. The *n*-th capturing group can be referred to as `{g}` (e. g. `{g3}`). `{g0}` refers to the original path. @@ -288,18 +319,36 @@ can use `{i3:05}`. PFERD even allows you to write entire expressions inside the curly braces, for example `{g2.lower()}` or `{g3.replace(' ', '_')}`. +Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` +- Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` +- Converts `foo/bar` into `BOOH/fear` +- Converts `fooooo/bear` into `BOOOOOH/fear` +- Converts `foo/bar/baz` into `BOOH/fear/baz` + [3]: "Format String Syntax" ### The `-name-re->` arrow The `-name-re>` arrow is like a combination of the `-name->` and `-re->` arrows. -Instead of the `SOURCE` being the name of a directory or file, it's a regex that -is matched against the names of directories and files. `TARGET` works like the -`-re->` arrow's target. -For example, the arrow `(.*)\.jpeg -name-re-> {g1}.jpg` will rename all `.jpeg` -extensions into `.jpg`. The arrow `\..+ -name-re-> !` will ignore all files and -directories starting with `.`. 
+Example: `(.*)\.jpeg -name-re-> {g1}.jpg` +- Doesn't match `foo/bar.png`, `baz.JPEG` or `hello,jpeg` +- Converts `foo/bar.jpeg` into `foo/bar.jpg` +- Converts `foo.jpeg/bar/baz.jpeg` into `foo.jpg/bar/baz.jpg` + +Example: `\..+ -name-re-> !` +- Doesn't match `.`, `test`, `a.b` +- Ignores all files and directories starting with `.`. + +### The `-exact-re->` arrow + +The `-exact-re>` arrow is like a combination of the `-exact->` and `-re->` arrows. + +Example: `f(oo+)/be?ar -exactre-> B{g1.upper()}H/fear` +- Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` +- Converts `foo/bar` into `BOOH/fear` +- Converts `fooooo/bear` into `BOOOOOH/fear` +- Doesn't match `foo/bar/baz` ### Example: Tutorials @@ -327,7 +376,7 @@ The second rule is required for many crawlers since they use the rules to decide which directories to crawl. If it was missing when the crawler looks at `tutorials/`, the third rule would match. This means the crawler would not crawl the `tutorials/` directory and thus not discover that `tutorials/tut02/` -existed. +exists. Since the second rule is only relevant for crawling, the `TARGET` is left out. @@ -352,9 +401,9 @@ To do this, you can use the most powerful of arrows: The regex arrow. Note the escaped backslashes on the `SOURCE` side. -### Example: Crawl a python project +### Example: Crawl a Python project -You are crawling a python project and want to ignore all hidden files (files +You are crawling a Python project and want to ignore all hidden files (files whose name starts with a `.`), all `__pycache__` directories and all markdown files (for some weird reason). @@ -374,8 +423,7 @@ README.md ... ``` -For this task, the name arrows can be used. They are variants of the normal -arrows that only look at the file name instead of the entire path. +For this task, the name arrows can be used. ``` \..* -name-re-> ! 
From bc65ea7ab696bf3f455c49bad4ae4375a75182a8 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 9 Jun 2021 22:35:55 +0200 Subject: [PATCH 008/224] Fix mypy complaining about missing type hints --- scripts/setup | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/setup b/scripts/setup index b48fb1a..f6680bb 100755 --- a/scripts/setup +++ b/scripts/setup @@ -12,6 +12,6 @@ pip install --upgrade setuptools # Installing PFERD itself pip install --editable . -# Installing various tools -pip install --upgrade mypy flake8 autopep8 isort -pip install --upgrade pyinstaller +# Installing tools and type hints +pip install --upgrade mypy flake8 autopep8 isort pyinstaller +pip install --upgrade types-chardet types-certifi From a292c4c437d631d7eae3a0adfd98adbefd52c2eb Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 12 Jun 2021 14:57:29 +0200 Subject: [PATCH 009/224] Add example for ">>" arrow heads --- CONFIG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CONFIG.md b/CONFIG.md index 1793ddc..f2710e1 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -430,3 +430,14 @@ For this task, the name arrows can be used. __pycache__ -name-> ! .*\.md -name-re-> ! ``` + +### Example: Clean up names + +You want to convert all paths into lowercase and replace spaces with underscores +before applying any rules. This can be achieved using the `>>` arrow heads. + +``` +(.*) -re->> "{g1.lower().replace(' ', '_')}" + + +``` From 601e4b936b320e766c0de18d384a92a5750f72b9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 12 Jun 2021 15:00:52 +0200 Subject: [PATCH 010/224] Use new arrow logic in README example config --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d25e86f..681bdf7 100644 --- a/README.md +++ b/README.md @@ -116,17 +116,18 @@ transform = Online-Tests --> ! Vorlesungswerbung --> ! 
+ # Rename folders + Lehrbücher --> Vorlesung + # Note the ">>" arrow head which lets us apply further rules to files moved to "Übung" + Übungsunterlagen -->> Übung + # Move exercises to own folder. Rename them to "Blatt-XX.pdf" to make them sort properly - "Übungsunterlagen/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf + "Übung/(\d+). Übungsblatt.pdf" -re-> Blätter/Blatt-{i1:02}.pdf # Move solutions to own folder. Rename them to "Blatt-XX-Lösung.pdf" to make them sort properly - "Übungsunterlagen/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf + "Übung/(\d+). Übungsblatt.*Musterlösung.pdf" -re-> Blätter/Blatt-{i1:02}-Lösung.pdf # The course has nested folders with the same name - flatten them - "Übungsunterlagen/(.+?)/\\1/(.*)" -re-> Übung/{g1}/{g2} - - # Rename remaining folders - Übungsunterlagen --> Übung - Lehrbücher --> Vorlesung + "Übung/(.+?)/\\1" -re-> Übung/{g1} [crawl:Bar] type = kit-ilias-web From 70b33ecfd9ca3230303cc17f39fd8bc634737e2b Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 13 Jun 2021 15:06:50 +0200 Subject: [PATCH 011/224] Add migration notes to changelog Also clean up some other formatting for consistency --- CHANGELOG.md | 5 +++++ CONFIG.md | 6 +++--- README.md | 6 +++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffc6e81..d6049d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,11 @@ ambiguous situations. ## Unreleased +If your config file doesn't do weird things with transforms, it should continue +to work. If your `-re->` arrows behave weirdly, try replacing them with +`-exact-re->` arrows. If you're on Windows, you might need to switch from `\` +path separators to `/` in your regex rules. 
+ ### Added - `skip` option for crawlers - Rules with `>>` instead of `>` as arrow head diff --git a/CONFIG.md b/CONFIG.md index f2710e1..19afbd2 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -342,7 +342,8 @@ Example: `\..+ -name-re-> !` ### The `-exact-re->` arrow -The `-exact-re>` arrow is like a combination of the `-exact->` and `-re->` arrows. +The `-exact-re>` arrow is like a combination of the `-exact->` and `-re->` +arrows. Example: `f(oo+)/be?ar -exactre-> B{g1.upper()}H/fear` - Doesn't match `a/foo/bar`, `foo/abc/bar`, `afoo/bar` or `foo/bars` @@ -375,8 +376,7 @@ tutorials --> ! The second rule is required for many crawlers since they use the rules to decide which directories to crawl. If it was missing when the crawler looks at `tutorials/`, the third rule would match. This means the crawler would not crawl -the `tutorials/` directory and thus not discover that `tutorials/tut02/` -exists. +the `tutorials/` directory and thus not discover that `tutorials/tut02/` exists. Since the second rule is only relevant for crawling, the `TARGET` is left out. diff --git a/README.md b/README.md index 681bdf7..836147f 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,9 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. ## Basic usage -PFERD can be run directly from the command line with no config file. -Run `pferd -h` to get an overview of available commands and options. -Run `pferd -h` to see which options a command has. +PFERD can be run directly from the command line with no config file. Run `pferd +-h` to get an overview of available commands and options. Run `pferd +-h` to see which options a command has. 
For example, you can download your personal desktop from the KIT ILIAS like this: From 70ec64a48ba8a56a819dfdbacba974f108d1206e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 13 Jun 2021 15:39:22 +0200 Subject: [PATCH 012/224] Fix wrong base URL for multi-stage pages --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 2 +- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6049d2..c09f921 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,9 @@ path separators to `/` in your regex rules. - Use the label to the left for exercises instead of the button name to determine the folder name +### Fixed +- Video pagination handling in ILIAS crawler + ## 3.0.1 - 2021-06-01 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index db9a303..384f0de 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -480,7 +480,7 @@ class IliasPage: return None if "opencast" in str(img_tag["alt"]).lower(): - return IliasElementType.VIDEO_FOLDER + return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED if str(img_tag["src"]).endswith("icon_exc.svg"): return IliasElementType.EXERCISE diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 78428e0..6495da9 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -253,7 +253,7 @@ instance's greatest bottleneck. 
soup = await self._get_page(next_stage_url) log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") log.explain(f"URL: {next_stage_url}") - page = IliasPage(soup, url, parent) + page = IliasPage(soup, next_stage_url, parent) next_stage_url = page.get_next_stage_url() elements.extend(page.get_child_elements()) From 57aef262179f72795e30f1c93254a32f084c0e23 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 13 Jun 2021 16:32:22 +0200 Subject: [PATCH 013/224] Fix name arrows I seem to have (re-)implemented them incorrectly and never tested them. --- PFERD/transformer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index bf51d6a..a37443a 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -142,18 +142,23 @@ class RenamingPartsTf(Transformation): def transform(self, path: PurePath) -> TransformResult: result = PurePath() + any_part_matched = False for part in path.parts: transformed = self.sub_tf.transform(PurePath(part)) if not transformed: result /= part elif isinstance(transformed, Transformed): result /= transformed.path + any_part_matched = True elif isinstance(transformed, Ignored): return transformed else: raise RuntimeError(f"Invalid transform result of type {type(transformed)}: {transformed}") - return None + if any_part_matched: + return Transformed(result) + else: + return None class RuleParseError(Exception): From 6e4d423c812c52aff95249ad992dc4889d971208 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 13 Jun 2021 16:50:29 +0200 Subject: [PATCH 014/224] Crawl all video stages in one crawl bar This ensures folders are not renamed, as they are crawled twice --- PFERD/crawl/ilias/kit_ilias_html.py | 6 ++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 +++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 384f0de..41f45e2 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py 
+++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -62,9 +62,11 @@ class IliasPage: log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() - def get_next_stage_url(self) -> Optional[str]: + def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_ilias_opencast_embedding(): - return self.get_child_elements()[0].url + return self.get_child_elements()[0] + if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + return self._find_video_entries_paginated()[0] return None def _is_video_player(self) -> bool: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 6495da9..41c301c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -248,13 +248,18 @@ instance's greatest bottleneck. elements.clear() async with cl: next_stage_url: Optional[str] = url + current_parent = parent while next_stage_url: soup = await self._get_page(next_stage_url) log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") log.explain(f"URL: {next_stage_url}") - page = IliasPage(soup, next_stage_url, parent) - next_stage_url = page.get_next_stage_url() + page = IliasPage(soup, next_stage_url, current_parent) + if next_element := page.get_next_stage_element(): + current_parent = next_element + next_stage_url = next_element.url + else: + next_stage_url = None elements.extend(page.get_child_elements()) From 75fde870c2cc4b0f8b87c80cae87e61f9379ddd2 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 13 Jun 2021 17:23:18 +0200 Subject: [PATCH 015/224] Bump version to 3.1.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c09f921..427219e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.1.0 - 2021-06-13 + If your config file doesn't do weird things with transforms, it should continue to work. If your `-re->` arrows behave weirdly, try replacing them with `-exact-re->` arrows. If you're on Windows, you might need to switch from `\` diff --git a/PFERD/version.py b/PFERD/version.py index 2aae99d..8ce7ae4 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.0.1" +VERSION = "3.1.0" From 80eeb8fe97e28437dcce0e148ffba202fde6a156 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 1 Jul 2021 11:01:55 +0200 Subject: [PATCH 016/224] Add --skip option --- PFERD/__main__.py | 2 +- PFERD/cli/parser.py | 8 ++++++++ PFERD/pferd.py | 24 +++++++++++++++++++----- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index b274b6b..b665feb 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -116,7 +116,7 @@ def main() -> None: sys.exit() try: - pferd = Pferd(config, args.crawler) + pferd = Pferd(config, args.crawler, args.skip) except PferdLoadError as e: log.unlock() log.error(str(e)) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index f5fb215..e753023 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -181,6 +181,14 @@ PARSER.add_argument( help="only execute a single crawler." " Can be specified multiple times to execute multiple crawlers" ) +PARSER.add_argument( + "--skip", "-S", + action="append", + type=str, + metavar="NAME", + help="don't execute this particular crawler." 
+ " Can be specified multiple times to skip multiple crawlers" +) PARSER.add_argument( "--working-dir", type=Path, diff --git a/PFERD/pferd.py b/PFERD/pferd.py index d98b426..726ed45 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -15,13 +15,13 @@ class PferdLoadError(Exception): class Pferd: - def __init__(self, config: Config, cli_crawlers: Optional[List[str]]): + def __init__(self, config: Config, cli_crawlers: Optional[List[str]], cli_skips: Optional[List[str]]): """ May throw PferdLoadError. """ self._config = config - self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers) + self._crawlers_to_run = self._find_crawlers_to_run(config, cli_crawlers, cli_skips) self._authenticators: Dict[str, Authenticator] = {} self._crawlers: Dict[str, Crawler] = {} @@ -65,16 +65,30 @@ class Pferd: return crawlers_to_run - def _find_crawlers_to_run(self, config: Config, cli_crawlers: Optional[List[str]]) -> List[str]: + def _find_crawlers_to_run( + self, + config: Config, + cli_crawlers: Optional[List[str]], + cli_skips: Optional[List[str]], + ) -> List[str]: log.explain_topic("Deciding which crawlers to run") + crawlers: List[str] if cli_crawlers is None: log.explain("No crawlers specified on CLI") log.explain("Running crawlers specified in config") - return self._find_config_crawlers(config) + crawlers = self._find_config_crawlers(config) else: log.explain("Crawlers specified on CLI") - return self._find_cli_crawlers(config, cli_crawlers) + crawlers = self._find_cli_crawlers(config, cli_crawlers) + + skips = {f"crawl:{name}" for name in cli_skips} if cli_skips else set() + for crawler in crawlers: + if crawler in skips: + log.explain(f"Skipping crawler {crawler!r}") + crawlers = [crawler for crawler in crawlers if crawler not in skips] + + return crawlers def _load_authenticators(self) -> None: for name, section in self._config.auth_sections(): From 9ffd6033575ed0ed603663e60bd00b8adb5b8295 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 1 Jul 2021 
11:14:50 +0200 Subject: [PATCH 017/224] Error when using multiple segments with -name-> Previously, PFERD just silently never matched the -name-> arrow. Now, it errors when loading the config file. --- PFERD/transformer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/PFERD/transformer.py b/PFERD/transformer.py index a37443a..1a56e27 100644 --- a/PFERD/transformer.py +++ b/PFERD/transformer.py @@ -41,9 +41,11 @@ TransformResult = Optional[Union[Transformed, Ignored]] @dataclass class Rule: left: str + left_index: int name: str head: ArrowHead right: RightSide + right_index: int def right_result(self, path: PurePath) -> Union[str, Transformed, Ignored]: if isinstance(self.right, str): @@ -345,6 +347,7 @@ def parse_eol(line: Line) -> None: def parse_rule(line: Line) -> Rule: parse_zero_or_more_spaces(line) + left_index = line.index left = parse_left(line) parse_one_or_more_spaces(line) @@ -354,19 +357,19 @@ def parse_rule(line: Line) -> Rule: line.expect("-") head = parse_arrow_head(line) - index = line.index + right_index = line.index right: RightSide try: parse_zero_or_more_spaces(line) parse_eol(line) right = Empty() except RuleParseError: - line.index = index + line.index = right_index parse_one_or_more_spaces(line) right = parse_right(line) parse_eol(line) - return Rule(left, name, head, right) + return Rule(left, left_index, name, head, right, right_index) def parse_transformation(line: Line) -> Transformation: @@ -377,6 +380,9 @@ def parse_transformation(line: Line) -> Transformation: elif rule.name == "exact": return ExactTf(rule) elif rule.name == "name": + if len(PurePath(rule.left).parts) > 1: + line.index = rule.left_index + raise RuleParseError(line, "Expected name, not multiple segments") return RenamingPartsTf(ExactTf(rule)) elif rule.name == "re": return RenamingParentsTf(ExactReTf(rule)) From 91200f3684973f40d6409ce38368eceb6e73da0f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 3 Jul 2021 12:07:18 +0200 
Subject: [PATCH 018/224] Fix nondeterministic name deduplication --- PFERD/crawl/crawler.py | 8 +- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 145 +++++++++++++-------- 2 files changed, 93 insertions(+), 60 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index d61783f..d798bc3 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -56,7 +56,7 @@ def noncritical(f: Wrapped) -> Wrapped: return wrapper # type: ignore -AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) +AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) def anoncritical(f: AWrapped) -> AWrapped: @@ -72,14 +72,14 @@ def anoncritical(f: AWrapped) -> AWrapped: Warning: Must only be applied to member functions of the Crawler class! """ - async def wrapper(*args: Any, **kwargs: Any) -> None: + async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: if not (args and isinstance(args[0], Crawler)): raise RuntimeError("@anoncritical must only applied to Crawler methods") crawler = args[0] try: - await f(*args, **kwargs) + return await f(*args, **kwargs) except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: log.warn(str(e)) crawler.error_free = False @@ -87,6 +87,8 @@ def anoncritical(f: AWrapped) -> AWrapped: crawler.error_free = False raise + return None + return wrapper # type: ignore diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 41c301c..a61eb4e 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -12,7 +12,7 @@ from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param -from ..crawler import CrawlError, CrawlWarning, anoncritical +from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, 
HttpCrawlerSection from .file_templates import Links from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement @@ -81,17 +81,16 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([ IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, ]) -AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[None]]) +AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: def decorator(f: AWrapped) -> AWrapped: - async def wrapper(*args: Any, **kwargs: Any) -> None: + async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: last_exception: Optional[BaseException] = None for round in range(attempts): try: - await f(*args, **kwargs) - return + return await f(*args, **kwargs) except aiohttp.ContentTypeError: # invalid content type raise CrawlWarning("ILIAS returned an invalid content type") except aiohttp.TooManyRedirects: @@ -230,17 +229,33 @@ instance's greatest bottleneck. # Fill up our task list with the found elements await gather_elements() - tasks = [self._handle_ilias_element(PurePath("."), element) for element in elements] + + tasks: List[Awaitable[None]] = [] + for element in elements: + if handle := await self._handle_ilias_element(PurePath("."), element): + tasks.append(asyncio.create_task(handle)) # And execute them await self.gather(tasks) - async def _handle_ilias_page(self, url: str, parent: IliasPageElement, path: PurePath) -> None: + async def _handle_ilias_page( + self, + url: str, + parent: IliasPageElement, + path: PurePath, + ) -> Optional[Awaitable[None]]: maybe_cl = await self.crawl(path) if not maybe_cl: - return - cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + return None + return self._crawl_ilias_page(url, parent, path, maybe_cl) + async def _crawl_ilias_page( + self, + url: str, + parent: IliasPageElement, + path: PurePath, + cl: CrawlToken, + ) -> None: elements: List[IliasPageElement] = [] 
@_iorepeat(3, "crawling folder") @@ -265,7 +280,11 @@ instance's greatest bottleneck. # Fill up our task list with the found elements await gather_elements() - tasks = [self._handle_ilias_element(cl.path, element) for element in elements] + + tasks: List[Awaitable[None]] = [] + for element in elements: + if handle := await self._handle_ilias_element(cl.path, element): + tasks.append(asyncio.create_task(handle)) # And execute them await self.gather(tasks) @@ -274,7 +293,11 @@ instance's greatest bottleneck. # Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical. # If that happens we will be terminated as anoncritical doesn't tream them as non-critical. @_wrap_io_in_warning("handling ilias element") - async def _handle_ilias_element(self, parent_path: PurePath, element: IliasPageElement) -> None: + async def _handle_ilias_element( + self, + parent_path: PurePath, + element: IliasPageElement, + ) -> Optional[Awaitable[None]]: element_path = PurePath(parent_path, element.name) if element.type in _VIDEO_ELEMENTS: @@ -282,35 +305,41 @@ instance's greatest bottleneck. 
if not self._videos: log.explain("Video crawling is disabled") log.explain("Answer: no") - return + return None else: log.explain("Video crawling is enabled") log.explain("Answer: yes") if element.type == IliasElementType.FILE: - await self._download_file(element, element_path) + return await self._handle_file(element, element_path) elif element.type == IliasElementType.FORUM: log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Forums are not supported") log.explain("Answer: No") + return None elif element.type == IliasElementType.TEST: log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Tests contain no relevant files") log.explain("Answer: No") + return None elif element.type == IliasElementType.LINK: - await self._download_link(element, element_path) + return await self._handle_link(element, element_path) elif element.type == IliasElementType.VIDEO: - await self._download_file(element, element_path) + return await self._handle_file(element, element_path) elif element.type == IliasElementType.VIDEO_PLAYER: - await self._download_video(element, element_path) + return await self._handle_video(element, element_path) elif element.type in _DIRECTORY_PAGES: - await self._handle_ilias_page(element.url, element, element_path) + return await self._handle_ilias_page(element.url, element, element_path) else: # This will retry it a few times, failing everytime. It doesn't make any network # requests, so that's fine. raise CrawlWarning(f"Unknown element type: {element.type!r}") - async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: + async def _handle_link( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Awaitable[None]]: log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain(f"Links type is {self._links}") @@ -318,32 +347,30 @@ instance's greatest bottleneck. 
link_extension = self._links.extension() if not link_template_maybe or not link_extension: log.explain("Answer: No") - return + return None else: log.explain("Answer: Yes") - link_template = link_template_maybe element_path = element_path.with_name(element_path.name + link_extension) maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: - return - dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + return None - @_iorepeat(3, "resolving link") - async def impl() -> None: - async with dl as (bar, sink): - export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") - real_url = await self._resolve_link_target(export_url) + return self._download_link(element, link_template_maybe, maybe_dl) - content = link_template - content = content.replace("{{link}}", real_url) - content = content.replace("{{name}}", element.name) - content = content.replace("{{description}}", str(element.description)) - content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) - sink.file.write(content.encode("utf-8")) - sink.done() + @_iorepeat(3, "resolving link") + async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: + async with dl as (bar, sink): + export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") + real_url = await self._resolve_link_target(export_url) - await impl() + content = link_template + content = content.replace("{{link}}", real_url) + content = content.replace("{{name}}", element.name) + content = content.replace("{{description}}", str(element.description)) + content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) + sink.file.write(content.encode("utf-8")) + sink.done() async def _resolve_link_target(self, export_url: str) -> str: async with self.session.get(export_url, allow_redirects=False) as resp: @@ -360,39 +387,43 @@ instance's greatest bottleneck. 
raise CrawlError("resolve_link_target failed even after authenticating") - async def _download_video(self, element: IliasPageElement, element_path: PurePath) -> None: + async def _handle_video( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Awaitable[None]]: # Videos will NOT be redownloaded - their content doesn't really change and they are chunky maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) if not maybe_dl: - return - dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + return None - @_iorepeat(3, "downloading video") - async def impl() -> None: - assert dl # The function is only reached when dl is not None - async with dl as (bar, sink): - page = IliasPage(await self._get_page(element.url), element.url, element) - real_element = page.get_child_elements()[0] + return self._download_video(element, maybe_dl) - log.explain(f"Streaming video from real url {real_element.url}") + @_iorepeat(3, "downloading video") + async def _download_video(self, element: IliasPageElement, dl: DownloadToken) -> None: + async with dl as (bar, sink): + page = IliasPage(await self._get_page(element.url), element.url, element) + real_element = page.get_child_elements()[0] - await self._stream_from_url(real_element.url, sink, bar, is_video=True) + log.explain(f"Streaming video from real url {real_element.url}") - await impl() + await self._stream_from_url(real_element.url, sink, bar, is_video=True) - async def _download_file(self, element: IliasPageElement, element_path: PurePath) -> None: + async def _handle_file( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Awaitable[None]]: maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: - return - dl = maybe_dl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 + return None + return self._download_file(element, maybe_dl) - 
@_iorepeat(3, "downloading file") - async def impl() -> None: - assert dl # The function is only reached when dl is not None - async with dl as (bar, sink): - await self._stream_from_url(element.url, sink, bar, is_video=False) - - await impl() + @_iorepeat(3, "downloading file") + async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: + assert dl # The function is only reached when dl is not None + async with dl as (bar, sink): + await self._stream_from_url(element.url, sink, bar, is_video=False) async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: async def try_stream() -> bool: From 89be07d4d3562c75f10539c7a51c171933d3de82 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 3 Jul 2021 17:05:48 +0200 Subject: [PATCH 019/224] Use final crawl path in HTML parsing message --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index a61eb4e..83cac32 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -247,13 +247,12 @@ instance's greatest bottleneck. maybe_cl = await self.crawl(path) if not maybe_cl: return None - return self._crawl_ilias_page(url, parent, path, maybe_cl) + return self._crawl_ilias_page(url, parent, maybe_cl) async def _crawl_ilias_page( self, url: str, parent: IliasPageElement, - path: PurePath, cl: CrawlToken, ) -> None: elements: List[IliasPageElement] = [] @@ -267,7 +266,7 @@ instance's greatest bottleneck. 
while next_stage_url: soup = await self._get_page(next_stage_url) - log.explain_topic(f"Parsing HTML page for {fmt_path(path)}") + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") log.explain(f"URL: {next_stage_url}") page = IliasPage(soup, next_stage_url, current_parent) if next_element := page.get_next_stage_element(): From 8ec3f41251cf69a365c9009400e67d539bb4afc4 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 6 Jul 2021 16:13:23 +0200 Subject: [PATCH 020/224] Crawl ilias booking objects as links --- PFERD/crawl/ilias/kit_ilias_html.py | 4 ++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 58 +++++++++++++++++++--- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 41f45e2..247002b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -22,6 +22,7 @@ class IliasElementType(Enum): FOLDER = "folder" FORUM = "forum" LINK = "link" + BOOKING = "booking" MEETING = "meeting" VIDEO = "video" VIDEO_PLAYER = "video_player" @@ -490,6 +491,9 @@ class IliasPage: if str(img_tag["src"]).endswith("icon_webr.svg"): return IliasElementType.LINK + if str(img_tag["src"]).endswith("icon_book.svg"): + return IliasElementType.BOOKING + if str(img_tag["src"]).endswith("frm.svg"): return IliasElementType.FORUM diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 83cac32..a0e323b 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -323,6 +323,8 @@ instance's greatest bottleneck. 
return None elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) + elif element.type == IliasElementType.BOOKING: + return await self._handle_booking(element, element_path) elif element.type == IliasElementType.VIDEO: return await self._handle_file(element, element_path) elif element.type == IliasElementType.VIDEO_PLAYER: @@ -362,14 +364,56 @@ instance's greatest bottleneck. async with dl as (bar, sink): export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") real_url = await self._resolve_link_target(export_url) + self._write_link_content(link_template, real_url, element.name, element.description, sink) - content = link_template - content = content.replace("{{link}}", real_url) - content = content.replace("{{name}}", element.name) - content = content.replace("{{description}}", str(element.description)) - content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) - sink.file.write(content.encode("utf-8")) - sink.done() + def _write_link_content( + self, + link_template: str, + url: str, + name: str, + description: Optional[str], + sink: FileSink, + ) -> None: + content = link_template + content = content.replace("{{link}}", url) + content = content.replace("{{name}}", name) + content = content.replace("{{description}}", str(description)) + content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) + sink.file.write(content.encode("utf-8")) + sink.done() + + async def _handle_booking( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Awaitable[None]]: + log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}") + log.explain(f"Links type is {self._links}") + + link_template_maybe = self._links.template() + link_extension = self._links.extension() + if not link_template_maybe or not link_extension: + log.explain("Answer: No") + return None + else: + log.explain("Answer: Yes") + element_path = 
element_path.with_name(element_path.name + link_extension) + + maybe_dl = await self.download(element_path, mtime=element.mtime) + if not maybe_dl: + return None + + return self._download_booking(element, link_template_maybe, maybe_dl) + + @_iorepeat(3, "resolving booking") + async def _download_booking( + self, + element: IliasPageElement, + link_template: str, + dl: DownloadToken, + ) -> None: + async with dl as (bar, sink): + self._write_link_content(link_template, element.url, element.name, element.description, sink) async def _resolve_link_target(self, export_url: str) -> str: async with self.session.get(export_url, allow_redirects=False) as resp: From ee67f9f4725be9f418d66b85bb8a749de8e5d713 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 6 Jul 2021 17:45:12 +0200 Subject: [PATCH 021/224] Sort elements by ILIAS id to ensure deterministic ordering --- PFERD/crawl/ilias/kit_ilias_html.py | 11 +++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 247002b..7e91926 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -38,6 +38,17 @@ class IliasPageElement: mtime: Optional[datetime] = None description: Optional[str] = None + def id(self) -> str: + regexes = [r"eid=(?P[0-9a-z\-]+)", r"file_(?P\d+)", r"ref_id=(?P\d+)"] + + for regex in regexes: + if match := re.search(regex, self.url): + return match.groupdict()["id"] + + # Fall back to URL + log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.") + return self.url + class IliasPage: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index a0e323b..cca6987 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -230,6 +230,8 @@ instance's greatest bottleneck. 
# Fill up our task list with the found elements await gather_elements() + elements.sort(key=lambda e: e.id()) + tasks: List[Awaitable[None]] = [] for element in elements: if handle := await self._handle_ilias_element(PurePath("."), element): @@ -280,6 +282,8 @@ instance's greatest bottleneck. # Fill up our task list with the found elements await gather_elements() + elements.sort(key=lambda e: e.id()) + tasks: List[Awaitable[None]] = [] for element in elements: if handle := await self._handle_ilias_element(cl.path, element): From 86f79ff1f137f6f728df08a51b12acb096e00979 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 7 Jul 2021 14:26:20 +0200 Subject: [PATCH 022/224] Update changelog --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 427219e..20dd53c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,19 @@ ambiguous situations. ## Unreleased +### Added +- `--skip` command line option +- Support for ILIAS booking objects + +### Changed +- Using multiple path segments on left side of `-name->` now results in an + error. This was already forbidden by the documentation but silently accepted + by PFERD. +- More consistent path printing in some `--explain` messages + +### Fixed +- Nondeterministic name deduplication due to ILIAS reordering elements + ## 3.1.0 - 2021-06-13 If your config file doesn't do weird things with transforms, it should continue From 544d45cbc570080964ab50044301b304343f9a31 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 13 Jul 2021 15:42:11 +0200 Subject: [PATCH 023/224] Catch non-critical exceptions at crawler top level --- CHANGELOG.md | 1 + PFERD/crawl/crawler.py | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20dd53c..181ef99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ ambiguous situations. 
### Fixed - Nondeterministic name deduplication due to ILIAS reordering elements +- More exceptions are handled properly ## 3.1.0 - 2021-06-13 diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index d798bc3..c492ee9 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -320,6 +320,7 @@ class Crawler(ABC): log.explain("Warnings or errors occurred during this run") log.explain("Answer: No") + @anoncritical async def run(self) -> None: """ Start the crawling process. Call this function if you want to use a From 742632ed8d6cebd10c7e28902afba2fccb108712 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 4 Aug 2021 18:27:26 +0000 Subject: [PATCH 024/224] Bump version to 3.2.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 181ef99..1ac3a8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.2.0 - 2021-08-04 + ### Added - `--skip` command line option - Support for ILIAS booking objects diff --git a/PFERD/version.py b/PFERD/version.py index 8ce7ae4..b8efadd 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.1.0" +VERSION = "3.2.0" From 66730773977a2602aebd5396efc1c6d8bd7b0ad7 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 21 Oct 2021 12:01:41 +0200 Subject: [PATCH 025/224] Add kit-ipd crawler --- CHANGELOG.md | 1 + CONFIG.md | 7 ++ PFERD/cli/__init__.py | 1 + PFERD/cli/command_kit_ipd.py | 46 +++++++++++ PFERD/crawl/__init__.py | 3 + PFERD/crawl/kit_ipd_crawler.py | 138 +++++++++++++++++++++++++++++++++ 6 files changed, 196 insertions(+) create mode 100644 PFERD/cli/command_kit_ipd.py create mode 100644 PFERD/crawl/kit_ipd_crawler.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ac3a8d..cca4839 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ ambiguous situations. 
### Added - `--skip` command line option - Support for ILIAS booking objects +- A KIT IPD crawler ### Changed - Using multiple path segments on left side of `-name->` now results in an diff --git a/CONFIG.md b/CONFIG.md index 19afbd2..06b9246 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -136,6 +136,13 @@ crawler simulate a slower, network-based crawler. requests. (Default: `0.0`) - `download_speed`: Download speed (in bytes per second) to simulate. (Optional) +### The `kit-ipd` crawler + +This crawler crals a KIT ipd page by url. The root page can be crawled from +outside the KIT network so you will be informed about any new/deleted files, +but downloading files requires you to be within. Adding a show delay between +requests is likely a good idea. + ### The `kit-ilias-web` crawler This crawler crawls the KIT ILIAS instance. diff --git a/PFERD/cli/__init__.py b/PFERD/cli/__init__.py index d70ecd9..efa8f00 100644 --- a/PFERD/cli/__init__.py +++ b/PFERD/cli/__init__.py @@ -9,4 +9,5 @@ from . import command_local # noqa: F401 imported but unused from . import command_kit_ilias_web # noqa: F401 imported but unused +from . 
import command_kit_ipd # noqa: F401 imported but unused from .parser import PARSER, ParserLoadError, load_default_section # noqa: F401 imported but unused diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py new file mode 100644 index 0000000..480cc9b --- /dev/null +++ b/PFERD/cli/command_kit_ipd.py @@ -0,0 +1,46 @@ +import argparse +import configparser +from pathlib import Path + +from ..logging import log +from .parser import CRAWLER_PARSER, SUBPARSERS, load_crawler + +SUBPARSER = SUBPARSERS.add_parser( + "kit-ipd", + parents=[CRAWLER_PARSER], +) + +GROUP = SUBPARSER.add_argument_group( + title="kit ipd crawler arguments", + description="arguments for the 'kit-ipd' crawler", +) +GROUP.add_argument( + "target", + type=str, + metavar="TARGET", + help="url to crawl" +) +GROUP.add_argument( + "output", + type=Path, + metavar="OUTPUT", + help="output directory" +) + + +def load( + args: argparse.Namespace, + parser: configparser.ConfigParser, +) -> None: + log.explain("Creating config for command 'kit-ipd'") + + parser["crawl:kit-ipd"] = {} + section = parser["crawl:ipd"] + load_crawler(args, section) + + section["type"] = "kit-ipd" + section["target"] = str(args.target) + section["output_dir"] = str(args.output) + + +SUBPARSER.set_defaults(command=load) diff --git a/PFERD/crawl/__init__.py b/PFERD/crawl/__init__.py index 7eb2fb1..1f8bd59 100644 --- a/PFERD/crawl/__init__.py +++ b/PFERD/crawl/__init__.py @@ -5,6 +5,7 @@ from ..auth import Authenticator from ..config import Config from .crawler import Crawler, CrawlError, CrawlerSection # noqa: F401 from .ilias import KitIliasWebCrawler, KitIliasWebCrawlerSection +from .kit_ipd_crawler import KitIpdCrawler, KitIpdCrawlerSection from .local_crawler import LocalCrawler, LocalCrawlerSection CrawlerConstructor = Callable[[ @@ -19,4 +20,6 @@ CRAWLERS: Dict[str, CrawlerConstructor] = { LocalCrawler(n, LocalCrawlerSection(s), c), "kit-ilias-web": lambda n, s, c, a: KitIliasWebCrawler(n, 
KitIliasWebCrawlerSection(s), c, a), + "kit-ipd": lambda n, s, c, a: + KitIpdCrawler(n, KitIpdCrawlerSection(s), c), } diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py new file mode 100644 index 0000000..4d4addd --- /dev/null +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -0,0 +1,138 @@ +import os +from dataclasses import dataclass +from pathlib import PurePath +from typing import List, Set, Union +from urllib.parse import urljoin + +from bs4 import BeautifulSoup, Tag + +from ..config import Config +from ..logging import ProgressBar, log +from ..output_dir import FileSink +from ..utils import soupify +from .crawler import CrawlError +from .http_crawler import HttpCrawler, HttpCrawlerSection + + +class KitIpdCrawlerSection(HttpCrawlerSection): + def target(self) -> str: + target = self.s.get("target") + if not target: + self.missing_value("target") + + if not target.startswith("https://"): + self.invalid_value("target", target, "Should be a URL") + + return target + + +@dataclass +class KitIpdFile: + name: str + url: str + + +@dataclass +class KitIpdFolder: + name: str + files: List[KitIpdFile] + + +class KitIpdCrawler(HttpCrawler): + + def __init__( + self, + name: str, + section: KitIpdCrawlerSection, + config: Config, + ): + super().__init__(name, section, config) + self._url = section.target() + + async def _run(self) -> None: + maybe_cl = await self.crawl(PurePath(".")) + if not maybe_cl: + return + + folders: List[KitIpdFolder] = [] + + async with maybe_cl: + folder_tags = await self._fetch_folder_tags() + folders = [self._extract_folder(tag) for tag in folder_tags] + + tasks = [self._crawl_folder(folder) for folder in folders] + + await self.gather(tasks) + + async def _crawl_folder(self, folder: KitIpdFolder) -> None: + path = PurePath(folder.name) + if not await self.crawl(path): + return + + tasks = [self._download_file(path, file) for file in folder.files] + + await self.gather(tasks) + + async def _download_file(self, parent: 
PurePath, file: KitIpdFile) -> None: + element_path = parent / file.name + maybe_dl = await self.download(element_path) + if not maybe_dl: + return + + async with maybe_dl as (bar, sink): + await self._stream_from_url(file.url, sink, bar) + + async def _fetch_folder_tags(self) -> Set[Tag]: + page = await self.get_page() + elements: List[Tag] = self._find_file_links(page) + folder_tags: Set[Tag] = set() + + for element in elements: + enclosing_data: Tag = element.findParent(name="td") + label: Tag = enclosing_data.findPreviousSibling(name="td") + folder_tags.add(label) + + return folder_tags + + def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: + name = folder_tag.getText().strip() + files: List[KitIpdFile] = [] + + container: Tag = folder_tag.findNextSibling(name="td") + for link in self._find_file_links(container): + files.append(self._extract_file(link)) + + log.explain_topic(f"Found folder {name!r}") + for file in files: + log.explain(f"Found file {file.name!r}") + + return KitIpdFolder(name, files) + + def _extract_file(self, link: Tag) -> KitIpdFile: + name = link.getText().strip() + url = self._abs_url_from_link(link) + _, extension = os.path.splitext(url) + return KitIpdFile(name + extension, url) + + def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: + return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x}) + + def _abs_url_from_link(self, link_tag: Tag) -> str: + return urljoin(self._url, link_tag.get("href")) + + async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: + async with self.session.get(url, allow_redirects=False) as resp: + if resp.status == 403: + raise CrawlError("Received a 403. 
Are you within the KIT network/VPN?") + if resp.content_length: + bar.set_total(resp.content_length) + + async for data in resp.content.iter_chunked(1024): + sink.file.write(data) + bar.advance(len(data)) + + sink.done() + + async def get_page(self) -> BeautifulSoup: + async with self.session.get(self._url) as request: + return soupify(await request.read()) From fee12b3d9e8469d37b972f28d84a7d44538744bc Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 25 Oct 2021 17:44:12 +0000 Subject: [PATCH 026/224] Fix changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cca4839..522d96d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,12 +22,14 @@ ambiguous situations. ## Unreleased +### Added +- A KIT IPD crawler + ## 3.2.0 - 2021-08-04 ### Added - `--skip` command line option - Support for ILIAS booking objects -- A KIT IPD crawler ### Changed - Using multiple path segments on left side of `-name->` now results in an From 55ea304ff338f249914b95938675a4e9b07d0875 Mon Sep 17 00:00:00 2001 From: lukasprobst Date: Mon, 25 Oct 2021 22:32:54 +0200 Subject: [PATCH 027/224] Disable interpolation of ConfigParser --- CHANGELOG.md | 3 +++ CONFIG.md | 6 +++--- LICENSE | 2 +- PFERD/__main__.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 522d96d..a90c978 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ ambiguous situations. ### Added - A KIT IPD crawler +### Removed +- [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file + ## 3.2.0 - 2021-08-04 ### Added diff --git a/CONFIG.md b/CONFIG.md index 06b9246..4d2ec33 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -4,11 +4,11 @@ A config file consists of sections. A section begins with a `[section]` header, which is followed by a list of `key = value` pairs. Comments must be on their own line and start with `#`. 
Multiline values must be indented beyond their key. Boolean values can be `yes` or `no`. For more details and some examples on the -format, see the [configparser documentation][1] ([basic interpolation][2] is -enabled). +format, see the [configparser documentation][1] ([interpolation][2] is +disabled). [1]: "Supported INI File Structure" -[2]: "BasicInterpolation" +[2]: "Interpolation of values" ## The `DEFAULT` section diff --git a/LICENSE b/LICENSE index 01f15f5..c096c4a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2019-2020 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim +Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim, thelukasprobst Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/__main__.py b/PFERD/__main__.py index b665feb..bdf5b34 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -15,7 +15,7 @@ from .transformer import RuleParseError def load_config_parser(args: argparse.Namespace) -> configparser.ConfigParser: log.explain_topic("Loading config") - parser = configparser.ConfigParser() + parser = configparser.ConfigParser(interpolation=None) if args.command is None: log.explain("No CLI command specified, loading config from file") From ef7d5ea2d3282e71cf0ba82698e409483cc1ad0a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 30 Oct 2021 18:09:05 +0200 Subject: [PATCH 028/224] Allow storing crawler-specific data in reports --- PFERD/report.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/PFERD/report.py b/PFERD/report.py index 919bb35..99a4661 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -1,6 +1,6 @@ import json from pathlib import Path, PurePath -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Optional, Set class ReportLoadError(Exception): @@ 
-67,6 +67,7 @@ class Report: self.deleted_files: Set[PurePath] = set() # Files that should have been deleted by the cleanup but weren't self.not_deleted_files: Set[PurePath] = set() + self.custom: Dict[str, Any] = dict() @staticmethod def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]: @@ -81,6 +82,15 @@ class Report: return result + @staticmethod + def _get_str_dictionary(data: Dict[str, Any], key: str) -> Dict[str, Any]: + result: Dict[str, Any] = data.get(key, {}) + + if not isinstance(result, dict): + raise ReportLoadError(f"Incorrect format: {key!r} is not a dictionary") + + return result + @classmethod def load(cls, path: Path) -> "Report": """ @@ -108,6 +118,7 @@ class Report: self.delete_file(PurePath(elem)) for elem in self._get_list_of_strs(data, "not_deleted"): self.not_delete_file(PurePath(elem)) + self.custom = self._get_str_dictionary(data, "custom") return self @@ -124,6 +135,7 @@ class Report: "changed": [str(path) for path in sorted(self.changed_files)], "deleted": [str(path) for path in sorted(self.deleted_files)], "not_deleted": [str(path) for path in sorted(self.not_deleted_files)], + "custom": self.custom } with open(path, "w") as f: @@ -190,3 +202,15 @@ class Report: """ self.not_deleted_files.add(path) + + def add_custom_value(self, key: str, value: Any) -> None: + """ + Adds a custom value under the passed key, overwriting any existing + """ + self.custom[key] = value + + def get_custom_value(self, key: str) -> Optional[Any]: + """ + Retrieves a custom value for the given key. 
+ """ + return self.custom.get(key) From f9a3f9b9f2702796f64d11d5d649261ea76a908d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 30 Oct 2021 18:12:29 +0200 Subject: [PATCH 029/224] Handle multi-stream videos --- PFERD/crawl/ilias/kit_ilias_html.py | 18 ++++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 86 +++++++++++++++++++--- 2 files changed, 92 insertions(+), 12 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 7e91926..78ae084 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -133,9 +133,21 @@ class IliasPage: # parse it json_object = json.loads(json_str) - # and fetch the video url! - video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"] - return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + streams = [stream for stream in json_object["streams"] if stream["type"] == "video"] + + # and just fetch the lone video url! + if len(streams) == 1: + video_url = streams[0]["sources"]["mp4"][0]["src"] + return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + + log.explain(f"Found multiple videos for stream at {self._source_name}") + items = [] + for stream in sorted(streams, key=lambda stream: stream["content"]): + full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" + video_url = stream["sources"]["mp4"][0]["src"] + items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) + + return items def _find_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index cca6987..f483754 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,7 +1,7 @@ import asyncio import re from pathlib import PurePath -from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, 
TypeVar, Union +from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast import aiohttp from aiohttp import hdrs @@ -439,22 +439,90 @@ instance's greatest bottleneck. element: IliasPageElement, element_path: PurePath, ) -> Optional[Awaitable[None]]: - # Videos will NOT be redownloaded - their content doesn't really change and they are chunky - maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.NEVER) - if not maybe_dl: + # Copy old mapping as it is likely still relevant + if self.prev_report: + self.report.add_custom_value( + str(element_path), + self.prev_report.get_custom_value(str(element_path)) + ) + + # A video might contain other videos, so let's "crawl" the video first + # to ensure rate limits apply. This must be a download as *this token* + # is re-used if the video consists of a single stream. In that case the + # file name is used and *not* the stream name the ilias html parser reported + # to ensure backwards compatibility. + maybe_dl = await self.download(element_path, redownload=Redownload.ALWAYS) + + # If we do not want to crawl it (user filter) or we have every file + # from the cached mapping already, we can ignore this and bail + if not maybe_dl or self._all_videos_locally_present(element_path): + # Mark all existing cideos as known so they do not get deleted + # during dleanup. We "downloaded" them, just without actually making + # a network request as we assumed they did not change. 
+ for video in self._previous_contained_videos(element_path): + await self.download(video) + return None - return self._download_video(element, maybe_dl) + return self._download_video(element_path, element, maybe_dl) + + def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]: + if not self.prev_report: + return [] + custom_value = self.prev_report.get_custom_value(str(video_path)) + if not custom_value: + return [] + names = cast(List[str], custom_value) + folder = video_path.parent + return [PurePath(folder, name) for name in names] + + def _all_videos_locally_present(self, video_path: PurePath) -> bool: + if contained_videos := self._previous_contained_videos(video_path): + log.explain_topic(f"Checking local cache for video {video_path.name}") + all_found_locally = True + for video in contained_videos: + all_found_locally = all_found_locally and self._output_dir.resolve(video).exists() + if all_found_locally: + log.explain("Found all videos locally, skipping enumeration request") + return True + log.explain("Missing at least one video, continuing with requests!") + return False @_iorepeat(3, "downloading video") - async def _download_video(self, element: IliasPageElement, dl: DownloadToken) -> None: + async def _download_video( + self, + original_path: PurePath, + element: IliasPageElement, + dl: DownloadToken + ) -> None: + stream_elements: List[IliasPageElement] = [] async with dl as (bar, sink): page = IliasPage(await self._get_page(element.url), element.url, element) - real_element = page.get_child_elements()[0] + stream_elements = page.get_child_elements() - log.explain(f"Streaming video from real url {real_element.url}") + if len(stream_elements) > 1: + log.explain(f"Found multiple video streams for {element.name}") + else: + log.explain(f"Using single video mode for {element.name}") + stream_element = stream_elements[0] + await self._stream_from_url(stream_element.url, sink, bar, is_video=True) + 
self.report.add_custom_value(str(original_path), [original_path.name]) + return - await self._stream_from_url(real_element.url, sink, bar, is_video=True) + contained_video_paths: List[str] = [] + + for stream_element in stream_elements: + contained_video_paths.append(stream_element.name) + video_path = original_path.parent / stream_element.name + + maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) + if not maybe_dl: + continue + async with maybe_dl as (bar, sink): + log.explain(f"Streaming video from real url {stream_element.url}") + await self._stream_from_url(stream_element.url, sink, bar, is_video=True) + + self.report.add_custom_value(str(original_path), contained_video_paths) async def _handle_file( self, From e42ab83d32ce852eb26e1a21982399e2988e769a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 25 Oct 2021 11:07:25 +0200 Subject: [PATCH 030/224] Add support for ILIAS cards --- PFERD/crawl/ilias/kit_ilias_html.py | 94 ++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 78ae084..d8c347d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -368,6 +368,8 @@ class IliasPage: log.explain(f"Found {element_name!r}") result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) + result += self._find_cards() + return result def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: @@ -450,6 +452,90 @@ class IliasPage: log.explain(f"Found file {full_path!r}") return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) + def _find_cards(self) -> List[IliasPageElement]: + result: List[IliasPageElement] = [] + + card_titles: List[Tag] = self._soup.select(".card-title a") + + for title in card_titles: + url = self._abs_url_from_link(title) + name = _sanitize_path_name(title.getText().strip()) + type = 
self._find_type_from_card(title) + + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {title}") + continue + + result.append(IliasPageElement(type, url, name)) + + card_button_tiles: List[Tag] = self._soup.select(".card-title button") + + for button in card_button_tiles: + regex = re.compile(button["id"] + r".*window.open\(['\"](.+?)['\"]") + res = regex.search(str(self._soup)) + if not res: + _unexpected_html_warning() + log.warn_contd(f"Could not find click handler target for {button}") + continue + url = self._abs_url_from_relative(res.group(1)) + name = _sanitize_path_name(button.getText().strip()) + type = self._find_type_from_card(button) + caption_parent = button.findParent( + "div", + attrs={"class": lambda x: x and "caption" in x}, + ) + description = caption_parent.find_next_sibling("div").getText().strip() + + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {button}") + continue + + result.append(IliasPageElement(type, url, name, description=description)) + + return result + + def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]: + def is_card_root(element: Tag) -> bool: + return "il-card" in element["class"] and "thumbnail" in element["class"] + + card_root: Optional[Tag] = None + + # We look for the card root + for parent in card_title.parents: + if is_card_root(parent): + card_root = parent + break + + if card_root is None: + _unexpected_html_warning() + log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}") + return None + + icon: Tag = card_root.select_one(".il-card-repository-head .icon") + + if "opencast" in icon["class"]: + return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED + if "exc" in icon["class"]: + return IliasElementType.EXERCISE + if "webr" in icon["class"]: + return IliasElementType.LINK + if "book" in icon["class"]: + return IliasElementType.BOOKING + if "frm" in icon["class"]: + return 
IliasElementType.FORUM + if "sess" in icon["class"]: + return IliasElementType.MEETING + if "tst" in icon["class"]: + return IliasElementType.TEST + if "fold" in icon["class"]: + return IliasElementType.FOLDER + + _unexpected_html_warning() + log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") + return None + @staticmethod def _find_type_from_link( element_name: str, @@ -550,7 +636,13 @@ class IliasPage: """ Create an absolute url from an tag. """ - return urljoin(self._page_url, link_tag.get("href")) + return self._abs_url_from_relative(link_tag.get("href")) + + def _abs_url_from_relative(self, relative_url: str) -> str: + """ + Create an absolute url from a relative URL. + """ + return urljoin(self._page_url, relative_url) def _unexpected_html_warning() -> None: From ad3f4955f72a6bfbdcbaaae24b821f078e6e44d5 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 30 Oct 2021 18:14:39 +0200 Subject: [PATCH 031/224] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a90c978..faa2507 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ ambiguous situations. ### Added - A KIT IPD crawler +- Support for ILIAS cards +- Support for multi-stream videos ### Removed - [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file From d6f38a61e16fa95d8a2365abc1cfd70f35ee0289 Mon Sep 17 00:00:00 2001 From: Toorero <22551563+Toorero@users.noreply.github.com> Date: Mon, 25 Oct 2021 21:34:51 +0200 Subject: [PATCH 032/224] Fixed minor spelling mistakes --- CONFIG.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 4d2ec33..8ccaa50 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -36,7 +36,7 @@ Sections whose names start with `crawl:` are used to configure crawlers. The rest of the section name specifies the name of the crawler. 
A crawler synchronizes a remote resource to a local directory. There are -different types of crawlers for different kinds of resources, e. g. ILIAS +different types of crawlers for different kinds of resources, e.g. ILIAS courses or lecture websites. Each crawl section represents an instance of a specific type of crawler. The @@ -53,7 +53,7 @@ common to all crawlers: crawler can still be executed manually using the `--crawler` or `-C` flags. (Default: `no`) - `output_dir`: The directory the crawler synchronizes files to. A crawler will - never place any files outside of this directory. (Default: the crawler's name) + never place any files outside this directory. (Default: the crawler's name) - `redownload`: When to download a file that is already present locally. (Default: `never-smart`) - `never`: If a file is present locally, it is not downloaded again. @@ -138,7 +138,7 @@ crawler simulate a slower, network-based crawler. ### The `kit-ipd` crawler -This crawler crals a KIT ipd page by url. The root page can be crawled from +This crawler crawls a KIT ipd page by url. The root page can be crawled from outside the KIT network so you will be informed about any new/deleted files, but downloading files requires you to be within. Adding a show delay between requests is likely a good idea. @@ -312,11 +312,11 @@ matches `SOURCE`, the output path is created using `TARGET` as template. `SOURCE` is automatically anchored. `TARGET` uses Python's [format string syntax][3]. The *n*-th capturing group can -be referred to as `{g}` (e. g. `{g3}`). `{g0}` refers to the original path. +be referred to as `{g}` (e.g. `{g3}`). `{g0}` refers to the original path. If capturing group *n*'s contents are a valid integer, the integer value is -available as `{i}` (e. g. `{i3}`). If capturing group *n*'s contents are a -valid float, the float value is available as `{f}` (e. g. `{f3}`). If a -capturing group is not present (e. g. when matching the string `cd` with the +available as `{i}` (e.g. 
`{i3}`). If capturing group *n*'s contents are a +valid float, the float value is available as `{f}` (e.g. `{f3}`). If a +capturing group is not present (e.g. when matching the string `cd` with the regex `(ab)?cd`), the corresponding variables are not defined. Python's format string syntax has rich options for formatting its arguments. For From 6b2a65757373193a5ecb8d2263ae7d758178014d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julius=20R=C3=BCberg?= <22551563+Toorero@users.noreply.github.com> Date: Mon, 1 Nov 2021 10:09:50 +0100 Subject: [PATCH 033/224] Fix IPD crawler for different subpages (#42) This patch reworks the IPD crawler to support subpages which do not use "/intern" for links and fetches the folder names from table headings. --- PFERD/crawl/kit_ipd_crawler.py | 50 ++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 4d4addd..1ed5ffe 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -1,7 +1,9 @@ import os +import re from dataclasses import dataclass from pathlib import PurePath -from typing import List, Set, Union +from re import Pattern +from typing import List, Set, Union, AnyStr, Optional from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -25,6 +27,10 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target + def link_regex(self) -> Pattern[AnyStr]: + regex = self.s.get("link_regex", "^.*/[^/]*\.(?:pdf|zip|c|java)$") + return re.compile(regex) + @dataclass class KitIpdFile: @@ -48,6 +54,7 @@ class KitIpdCrawler(HttpCrawler): ): super().__init__(name, section, config) self._url = section.target() + self._file_regex = section.link_regex() async def _run(self) -> None: maybe_cl = await self.crawl(PurePath(".")) @@ -88,19 +95,28 @@ class KitIpdCrawler(HttpCrawler): folder_tags: Set[Tag] = set() for element in elements: - enclosing_data: Tag = element.findParent(name="td") - label: 
Tag = enclosing_data.findPreviousSibling(name="td") - folder_tags.add(label) + folder_label = self._fetch_folder_label(element) + if folder_label is None: + folder_tags.add(page) + else: + folder_tags.add(folder_label) return folder_tags def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: - name = folder_tag.getText().strip() files: List[KitIpdFile] = [] + # if files have found outside a regular table + if not folder_tag.name.startswith("h"): + name = "." + root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag)) + for link in root_links: + files.append(self._extract_file(link)) - container: Tag = folder_tag.findNextSibling(name="td") - for link in self._find_file_links(container): - files.append(self._extract_file(link)) + else: + name = folder_tag.getText().strip() + container: Tag = folder_tag.findNextSibling(name="table") + for link in self._find_file_links(container): + files.append(self._extract_file(link)) log.explain_topic(f"Found folder {name!r}") for file in files: @@ -108,14 +124,24 @@ class KitIpdCrawler(HttpCrawler): return KitIpdFolder(name, files) + @staticmethod + def _fetch_folder_label(file_link: Tag) -> Optional[Tag]: + enclosing_table: Tag = file_link.findParent(name="table") + if enclosing_table is None: + return None + label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) + if label is None: + return None + else: + return label + def _extract_file(self, link: Tag) -> KitIpdFile: - name = link.getText().strip() url = self._abs_url_from_link(link) - _, extension = os.path.splitext(url) - return KitIpdFile(name + extension, url) + name = os.path.basename(url) + return KitIpdFile(name, url) def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: - return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x}) + return tag.findAll(name="a", attrs={"href": self._file_regex}) def _abs_url_from_link(self, link_tag: Tag) -> str: return 
urljoin(self._url, link_tag.get("href")) From 88afe64a928fce7108264f386298edbbe60117f5 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 1 Nov 2021 10:43:13 +0100 Subject: [PATCH 034/224] Refactor IPD crawler a bit --- PFERD/cli/command_kit_ipd.py | 2 +- PFERD/crawl/kit_ipd_crawler.py | 75 +++++++++++++++++----------------- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py index 480cc9b..c4c593f 100644 --- a/PFERD/cli/command_kit_ipd.py +++ b/PFERD/cli/command_kit_ipd.py @@ -35,7 +35,7 @@ def load( log.explain("Creating config for command 'kit-ipd'") parser["crawl:kit-ipd"] = {} - section = parser["crawl:ipd"] + section = parser["crawl:kit-ipd"] load_crawler(args, section) section["type"] = "kit-ipd" diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 1ed5ffe..76145b4 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from pathlib import PurePath from re import Pattern -from typing import List, Set, Union, AnyStr, Optional +from typing import Awaitable, List, Optional, Set, Union from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -27,12 +27,12 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target - def link_regex(self) -> Pattern[AnyStr]: - regex = self.s.get("link_regex", "^.*/[^/]*\.(?:pdf|zip|c|java)$") + def link_regex(self) -> Pattern[str]: + regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|java)$") return re.compile(regex) -@dataclass +@dataclass(unsafe_hash=True) class KitIpdFile: name: str url: str @@ -43,6 +43,14 @@ class KitIpdFolder: name: str files: List[KitIpdFile] + def explain(self) -> None: + log.explain_topic(f"Folder {self.name!r}") + for file in self.files: + log.explain(f"File {file.name!r}") + + def __hash__(self) -> int: + return self.name.__hash__() + class KitIpdCrawler(HttpCrawler): @@ -61,13 +69,15 @@ 
class KitIpdCrawler(HttpCrawler): if not maybe_cl: return - folders: List[KitIpdFolder] = [] + tasks: List[Awaitable[None]] = [] async with maybe_cl: - folder_tags = await self._fetch_folder_tags() - folders = [self._extract_folder(tag) for tag in folder_tags] - - tasks = [self._crawl_folder(folder) for folder in folders] + for item in await self._fetch_items(): + if isinstance(item, KitIpdFolder): + tasks.append(self._crawl_folder(item)) + else: + # Orphan files are placed in the root folder + tasks.append(self._download_file(PurePath("."), item)) await self.gather(tasks) @@ -89,51 +99,42 @@ class KitIpdCrawler(HttpCrawler): async with maybe_dl as (bar, sink): await self._stream_from_url(file.url, sink, bar) - async def _fetch_folder_tags(self) -> Set[Tag]: + async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: page = await self.get_page() elements: List[Tag] = self._find_file_links(page) - folder_tags: Set[Tag] = set() + items: Set[Union[KitIpdFile, KitIpdFolder]] = set() for element in elements: - folder_label = self._fetch_folder_label(element) - if folder_label is None: - folder_tags.add(page) + folder_label = self._find_folder_label(element) + if folder_label: + folder = self._extract_folder(folder_label) + if folder not in items: + items.add(folder) + folder.explain() else: - folder_tags.add(folder_label) + file = self._extract_file(element) + items.add(file) + log.explain_topic(f"Orphan file {file.name!r}") + log.explain("Attributing it to root folder") - return folder_tags + return items def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: files: List[KitIpdFile] = [] - # if files have found outside a regular table - if not folder_tag.name.startswith("h"): - name = "." 
- root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag)) - for link in root_links: - files.append(self._extract_file(link)) + name = folder_tag.getText().strip() - else: - name = folder_tag.getText().strip() - container: Tag = folder_tag.findNextSibling(name="table") - for link in self._find_file_links(container): - files.append(self._extract_file(link)) - - log.explain_topic(f"Found folder {name!r}") - for file in files: - log.explain(f"Found file {file.name!r}") + container: Tag = folder_tag.findNextSibling(name="table") + for link in self._find_file_links(container): + files.append(self._extract_file(link)) return KitIpdFolder(name, files) @staticmethod - def _fetch_folder_label(file_link: Tag) -> Optional[Tag]: + def _find_folder_label(file_link: Tag) -> Optional[Tag]: enclosing_table: Tag = file_link.findParent(name="table") if enclosing_table is None: return None - label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) - if label is None: - return None - else: - return label + return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) def _extract_file(self, link: Tag) -> KitIpdFile: url = self._abs_url_from_link(link) From 13b8c3d9c6c59ab2714e2670506d89c5a2cb6eb6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 2 Nov 2021 09:30:46 +0100 Subject: [PATCH 035/224] Add regex option to config and CLI parser --- CONFIG.md | 7 ++++++- LICENSE | 3 ++- PFERD/cli/command_kit_ipd.py | 8 ++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index 8ccaa50..569780d 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -138,11 +138,16 @@ crawler simulate a slower, network-based crawler. ### The `kit-ipd` crawler -This crawler crawls a KIT ipd page by url. The root page can be crawled from +This crawler crawls a KIT-IPD page by url. 
The root page can be crawled from outside the KIT network so you will be informed about any new/deleted files, but downloading files requires you to be within. Adding a show delay between requests is likely a good idea. +- `target`: URL to a KIT-IPD page +- `link_regex`: A regex that is matched against the `href` part of links. If it + matches, the given link is downloaded as a file. This is used to extract + files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|java)$`) + ### The `kit-ilias-web` crawler This crawler crawls the KIT ILIAS instance. diff --git a/LICENSE b/LICENSE index c096c4a..fe2293f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,5 @@ -Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim, thelukasprobst +Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, + TheChristophe, Scriptim, thelukasprobst, Toorero Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/cli/command_kit_ipd.py b/PFERD/cli/command_kit_ipd.py index c4c593f..b53e67e 100644 --- a/PFERD/cli/command_kit_ipd.py +++ b/PFERD/cli/command_kit_ipd.py @@ -14,6 +14,12 @@ GROUP = SUBPARSER.add_argument_group( title="kit ipd crawler arguments", description="arguments for the 'kit-ipd' crawler", ) +GROUP.add_argument( + "--link-regex", + type=str, + metavar="REGEX", + help="href-matching regex to identify downloadable files" +) GROUP.add_argument( "target", type=str, @@ -41,6 +47,8 @@ def load( section["type"] = "kit-ipd" section["target"] = str(args.target) section["output_dir"] = str(args.output) + if args.link_regex: + section["link_regex"] = str(args.link_regex) SUBPARSER.set_defaults(command=load) From 6289938d7c772660a5d497ce456168186eb8a6fb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 6 Nov 2021 12:09:51 +0100 Subject: [PATCH 036/224] Do not stop crawling files when encountering a 
CrawlWarning --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index f483754..c3e51ef 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -84,7 +84,7 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([ AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) -def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: +def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: def decorator(f: AWrapped) -> AWrapped: async def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: last_exception: Optional[BaseException] = None @@ -105,7 +105,10 @@ def _iorepeat(attempts: int, name: str) -> Callable[[AWrapped], AWrapped]: if last_exception: message = f"Error in I/O Operation: {last_exception}" - raise CrawlWarning(message) from last_exception + if failure_is_error: + raise CrawlError(message) from last_exception + else: + raise CrawlWarning(message) from last_exception raise CrawlError("Impossible return in ilias _iorepeat") return wrapper # type: ignore @@ -251,6 +254,7 @@ instance's greatest bottleneck. return None return self._crawl_ilias_page(url, parent, maybe_cl) + @anoncritical async def _crawl_ilias_page( self, url: str, @@ -292,10 +296,12 @@ instance's greatest bottleneck. # And execute them await self.gather(tasks) + # These decorators only apply *to this method* and *NOT* to the returned + # awaitables! + # This method does not await the handlers but returns them instead. + # This ensures one level is handled at a time and name deduplication + # works correctly. @anoncritical - # Shouldn't happen but we also really don't want to let I/O errors bubble up to anoncritical. 
- # If that happens we will be terminated as anoncritical doesn't tream them as non-critical. - @_wrap_io_in_warning("handling ilias element") async def _handle_ilias_element( self, parent_path: PurePath, @@ -363,6 +369,7 @@ instance's greatest bottleneck. return self._download_link(element, link_template_maybe, maybe_dl) + @anoncritical @_iorepeat(3, "resolving link") async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: async with dl as (bar, sink): @@ -409,6 +416,7 @@ instance's greatest bottleneck. return self._download_booking(element, link_template_maybe, maybe_dl) + @anoncritical @_iorepeat(3, "resolving booking") async def _download_booking( self, @@ -488,6 +496,7 @@ instance's greatest bottleneck. log.explain("Missing at least one video, continuing with requests!") return False + @anoncritical @_iorepeat(3, "downloading video") async def _download_video( self, @@ -534,6 +543,7 @@ instance's greatest bottleneck. return None return self._download_file(element, maybe_dl) + @anoncritical @_iorepeat(3, "downloading file") async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: assert dl # The function is only reached when dl is not None @@ -589,7 +599,7 @@ instance's greatest bottleneck. # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. 
- @_iorepeat(3, "Login") + @_iorepeat(3, "Login", failure_is_error=True) async def _authenticate(self) -> None: await self._shibboleth_login.login(self.session) From 90cb6e989b492bbfe2f242c77aad616b86637052 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 6 Nov 2021 23:20:24 +0100 Subject: [PATCH 037/224] Do not download single videos if cache does not exist --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c3e51ef..c6115f4 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -514,7 +514,12 @@ instance's greatest bottleneck. else: log.explain(f"Using single video mode for {element.name}") stream_element = stream_elements[0] - await self._stream_from_url(stream_element.url, sink, bar, is_video=True) + + # We do not have a local cache yet + if self._output_dir.resolve(original_path).exists(): + log.explain(f"Video for {element.name} existed locally") + else: + await self._stream_from_url(stream_element.url, sink, bar, is_video=True) self.report.add_custom_value(str(original_path), [original_path.name]) return From a82a0b19c2193c6817ae07361889de8fd392868e Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 7 Nov 2021 21:40:22 +0100 Subject: [PATCH 038/224] Collect crawler warnings/errors and include them in the report --- PFERD/crawl/crawler.py | 8 ++++++-- PFERD/pferd.py | 8 ++++++++ PFERD/report.py | 24 +++++++++++++++++++++++- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index c492ee9..53f43e9 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -47,10 +47,12 @@ def noncritical(f: Wrapped) -> Wrapped: try: f(*args, **kwargs) except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: + crawler.report.add_warning(str(e)) log.warn(str(e)) 
crawler.error_free = False - except: # noqa: E722 do not use bare 'except' + except Exception as e: crawler.error_free = False + crawler.report.add_error(str(e)) raise return wrapper # type: ignore @@ -83,8 +85,10 @@ def anoncritical(f: AWrapped) -> AWrapped: except (CrawlWarning, OutputDirError, MarkDuplicateError, MarkConflictError) as e: log.warn(str(e)) crawler.error_free = False - except: # noqa: E722 do not use bare 'except' + crawler.report.add_warning(str(e)) + except Exception as e: crawler.error_free = False + crawler.report.add_error(str(e)) raise return None diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 726ed45..079053b 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -182,5 +182,13 @@ class Pferd: something_changed = True log.report(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") + for warning in crawler.report.encountered_warnings: + something_changed = True + log.report(f" [bold bright_red]Warning[/] {warning}") + + for error in crawler.report.encountered_errors: + something_changed = True + log.report(f" [bold bright_red]Error[/] {error}") + if not something_changed: log.report(" Nothing changed") diff --git a/PFERD/report.py b/PFERD/report.py index 99a4661..0e0c789 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -67,8 +67,14 @@ class Report: self.deleted_files: Set[PurePath] = set() # Files that should have been deleted by the cleanup but weren't self.not_deleted_files: Set[PurePath] = set() + + # Custom crawler-specific data self.custom: Dict[str, Any] = dict() + # Encountered errors and warnings + self.encountered_warnings: List[str] = [] + self.encountered_errors: List[str] = [] + @staticmethod def _get_list_of_strs(data: Dict[str, Any], key: str) -> List[str]: result: Any = data.get(key, []) @@ -119,6 +125,8 @@ class Report: for elem in self._get_list_of_strs(data, "not_deleted"): self.not_delete_file(PurePath(elem)) self.custom = self._get_str_dictionary(data, "custom") + self.encountered_errors = 
self._get_list_of_strs(data, "encountered_errors") + self.encountered_warnings = self._get_list_of_strs(data, "encountered_warnings") return self @@ -135,7 +143,9 @@ class Report: "changed": [str(path) for path in sorted(self.changed_files)], "deleted": [str(path) for path in sorted(self.deleted_files)], "not_deleted": [str(path) for path in sorted(self.not_deleted_files)], - "custom": self.custom + "custom": self.custom, + "encountered_warnings": self.encountered_warnings, + "encountered_errors": self.encountered_errors, } with open(path, "w") as f: @@ -214,3 +224,15 @@ class Report: Retrieves a custom value for the given key. """ return self.custom.get(key) + + def add_error(self, error: str) -> None: + """ + Adds an error to this report's error list. + """ + self.encountered_errors.append(error) + + def add_warning(self, warning: str) -> None: + """ + Adds a warning to this report's warning list. + """ + self.encountered_warnings.append(warning) From eac2e341612461987d37314110c3f4c7640499f3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 7 Jan 2022 23:32:31 +0100 Subject: [PATCH 039/224] Fix is_logged_in for ILIAS 7 --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c6115f4..c5b2953 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -611,9 +611,10 @@ instance's greatest bottleneck. @staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: # Normal ILIAS pages - userlog = soup.find("li", {"id": "userlog"}) - if userlog is not None: - return True + mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") + if mainbar is not None: + login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) + return not login_button # Video listing embeds do not have complete ILIAS html. 
Try to match them by # their video listing table video_table = soup.find( From a99356f2a2d403ffb40f47bb159707d73e55a0e3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 00:27:34 +0100 Subject: [PATCH 040/224] Fix video stream extraction --- PFERD/crawl/ilias/kit_ilias_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d8c347d..ece88c5 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -133,7 +133,7 @@ class IliasPage: # parse it json_object = json.loads(json_str) - streams = [stream for stream in json_object["streams"] if stream["type"] == "video"] + streams = [stream for stream in json_object["streams"]] # and just fetch the lone video url! if len(streams) == 1: From 462d993fbc00602b4952d675fa4c77e5372c27fa Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 00:27:48 +0100 Subject: [PATCH 041/224] Fix local video path cache (hopefully) --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c5b2953..5d44566 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -489,7 +489,10 @@ instance's greatest bottleneck. 
log.explain_topic(f"Checking local cache for video {video_path.name}") all_found_locally = True for video in contained_videos: - all_found_locally = all_found_locally and self._output_dir.resolve(video).exists() + transformed_path = self._transformer.transform(video) + if transformed_path: + exists_locally = self._output_dir.resolve(transformed_path).exists() + all_found_locally = all_found_locally and exists_locally if all_found_locally: log.explain("Found all videos locally, skipping enumeration request") return True @@ -515,8 +518,12 @@ instance's greatest bottleneck. log.explain(f"Using single video mode for {element.name}") stream_element = stream_elements[0] + transformed_path = self._transformer.transform(original_path) + if not transformed_path: + raise CrawlError(f"Download returned a path but transform did not for {original_path}") + # We do not have a local cache yet - if self._output_dir.resolve(original_path).exists(): + if self._output_dir.resolve(transformed_path).exists(): log.explain(f"Video for {element.name} existed locally") else: await self._stream_from_url(stream_element.url, sink, bar, is_video=True) @@ -526,8 +533,8 @@ instance's greatest bottleneck. 
contained_video_paths: List[str] = [] for stream_element in stream_elements: - contained_video_paths.append(stream_element.name) video_path = original_path.parent / stream_element.name + contained_video_paths.append(str(video_path)) maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) if not maybe_dl: From 6f3cfd43969cdac557c4f2d38bd2b4f0ffd40721 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 16:58:15 +0100 Subject: [PATCH 042/224] Fix personal desktop crawling --- PFERD/crawl/ilias/kit_ilias_html.py | 61 ++++++++++++++++++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 +++- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index ece88c5..9c8ab95 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -39,7 +39,12 @@ class IliasPageElement: description: Optional[str] = None def id(self) -> str: - regexes = [r"eid=(?P[0-9a-z\-]+)", r"file_(?P\d+)", r"ref_id=(?P\d+)"] + regexes = [ + r"eid=(?P[0-9a-z\-]+)", + r"file_(?P\d+)", + r"ref_id=(?P\d+)", + r"target=[a-z]+_(?P\d+)" + ] for regex in regexes: if match := re.search(regex, self.url): @@ -71,6 +76,9 @@ class IliasPage: if self._is_exercise_file(): log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() + if self._is_personal_desktop(): + log.explain("Page is the personal desktop") + return self._find_personal_desktop_entries() log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() @@ -115,6 +123,9 @@ class IliasPage: return False + def _is_personal_desktop(self) -> bool: + return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. 
The actual video stream url is nowhere @@ -149,6 +160,26 @@ class IliasPage: return items + def _find_personal_desktop_entries(self) -> List[IliasPageElement]: + items: List[IliasPageElement] = [] + + titles: List[Tag] = self._soup.select(".il-item-title") + for title in titles: + link = title.find("a") + name = _sanitize_path_name(link.text.strip()) + url = self._abs_url_from_link(link) + + type = self._find_type_from_link(name, link, url) + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {link}") + continue + + log.explain(f"Found {name!r}") + items.append(IliasPageElement(type, url, name)) + + return items + def _find_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. The initial dummy page without any videos. This page contains the link to the listing @@ -551,9 +582,30 @@ class IliasPage: if "target=file_" in parsed_url.query: return IliasElementType.FILE + if "target=grp_" in parsed_url.query: + return IliasElementType.FOLDER + + if "target=crs_" in parsed_url.query: + return IliasElementType.FOLDER + + if "baseClass=ilExerciseHandlerGUI" in parsed_url.query: + return IliasElementType.EXERCISE + + if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query: + return IliasElementType.LINK + + if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query: + return IliasElementType.FORUM + + if "cmdClass=ilobjtestgui" in parsed_url.query: + return IliasElementType.TEST + + # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so + # try to guess it from the image. 
+ # Everything with a ref_id can *probably* be opened to reveal nested things # video groups, directories, exercises, etc - if "ref_id=" in parsed_url.query: + if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path: return IliasPage._find_type_from_folder_like(link_element, url) _unexpected_html_warning() @@ -574,7 +626,7 @@ class IliasPage: # We look for the outer div of our inner link, to find information around it # (mostly the icon) for parent in link_element.parents: - if "ilContainerListItemOuter" in parent["class"]: + if "ilContainerListItemOuter" in parent["class"] or "il-std-item" in parent["class"]: found_parent = parent break @@ -586,6 +638,9 @@ class IliasPage: # Find the small descriptive icon to figure out the type img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") + if img_tag is None: + img_tag = found_parent.select_one("img.icon") + if img_tag is None: _unexpected_html_warning() log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 5d44566..99d6cf6 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -203,7 +203,9 @@ instance's greatest bottleneck. await self._crawl_url(root_url, expected_id=course_id) async def _crawl_desktop(self) -> None: - await self._crawl_url(self._base_url) + appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items" + appendix = appendix.encode("ASCII").hex() + await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix) async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: maybe_cl = await self.crawl(PurePath(".")) @@ -622,6 +624,11 @@ instance's greatest bottleneck. 
if mainbar is not None: login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) return not login_button + + # Personal Desktop + if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): + return True + # Video listing embeds do not have complete ILIAS html. Try to match them by # their video listing table video_table = soup.find( From ced8b9a2d032e7e4956b331d4408cb4b0829c780 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 16:58:30 +0100 Subject: [PATCH 043/224] Fix some accordions --- PFERD/crawl/ilias/kit_ilias_html.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 9c8ab95..0a81222 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -428,7 +428,10 @@ class IliasPage: continue prev: Tag = parent.findPreviousSibling("div") if "ilContainerBlockHeader" in prev.get("class"): - found_titles.append(prev.find("h3").getText().strip()) + if prev.find("h3"): + found_titles.append(prev.find("h3").getText().strip()) + else: + found_titles.append(prev.find("h2").getText().strip()) # And this for real accordions if "il_VAccordionContentDef" in parent.get("class"): From 5f527bc697b58512520f4d8ff93b856ff3a345b1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 17:14:40 +0100 Subject: [PATCH 044/224] Remove Python 3.9 Pattern typehints --- PFERD/crawl/ilias/kit_ilias_html.py | 2 +- PFERD/crawl/kit_ipd_crawler.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 0a81222..78bedbf 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -132,7 +132,7 @@ class IliasPage: # on the page, but defined in a JS object inside a script tag, passed to the player # library. 
# We do the impossible and RegEx the stream JSON object out of the page's HTML source - regex: re.Pattern[str] = re.compile( + regex = re.compile( r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE ) json_match = regex.search(str(self._soup)) diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 76145b4..1a5314b 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -2,8 +2,7 @@ import os import re from dataclasses import dataclass from pathlib import PurePath -from re import Pattern -from typing import Awaitable, List, Optional, Set, Union +from typing import Awaitable, List, Optional, Pattern, Set, Union from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag From e32c1f000fb9abcc47f8dc127b4d674acfa1662c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 18:05:48 +0100 Subject: [PATCH 045/224] Fix mtime for single streams --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 99d6cf6..c4e70c0 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -461,7 +461,7 @@ instance's greatest bottleneck. # is re-used if the video consists of a single stream. In that case the # file name is used and *not* the stream name the ilias html parser reported # to ensure backwards compatibility. 
- maybe_dl = await self.download(element_path, redownload=Redownload.ALWAYS) + maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS) # If we do not want to crawl it (user filter) or we have every file # from the cached mapping already, we can ignore this and bail From eb4de8ae0cc37e38e9fa801f729e68d1f71a0bb0 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 18:14:43 +0100 Subject: [PATCH 046/224] Ignore 1970 dates as windows crashes when calling .timestamp() --- PFERD/output_dir.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 0fb9911..e612267 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -231,7 +231,8 @@ class OutputDirectory: stat = local_path.stat() remote_newer = None - if mtime := heuristics.mtime: + if heuristics.mtime and heuristics.mtime.year > 1970: + mtime = heuristics.mtime remote_newer = mtime.timestamp() > stat.st_mtime if remote_newer: log.explain("Remote file seems to be newer") From 43c5453e100aedede844a242721d2990845c2c26 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 19:59:42 +0100 Subject: [PATCH 047/224] Correctly crawl files on desktop The files on the desktop do not include a download link, so we need to rewrite it. 
--- PFERD/crawl/ilias/kit_ilias_html.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 78bedbf..cee0555 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -176,6 +176,11 @@ class IliasPage: continue log.explain(f"Found {name!r}") + + if type == IliasElementType.FILE and "_download" not in url: + url = re.sub(r"(target=file_\d+)", r"\1_download", url) + log.explain("Rewired file URL to include download part") + items.append(IliasPageElement(type, url, name)) return items From 10d9d7452809aafe4f406f894944a078072f16bf Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 20:28:30 +0100 Subject: [PATCH 048/224] Bail out when crawling recursive courses --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c4e70c0..8f78e7a 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -182,6 +182,7 @@ instance's greatest bottleneck. self._link_file_redirect_delay = section.link_redirect_delay() self._links = section.links() self._videos = section.videos() + self._visited_urls: Set[str] = set() async def _run(self) -> None: if isinstance(self._target, int): @@ -309,6 +310,12 @@ instance's greatest bottleneck. parent_path: PurePath, element: IliasPageElement, ) -> Optional[Awaitable[None]]: + if element.url in self._visited_urls: + raise CrawlWarning( + f"Found second path to element {element.name!r} at {element.url!r}. 
Aborting subpath" + ) + self._visited_urls.add(element.url) + element_path = PurePath(parent_path, element.name) if element.type in _VIDEO_ELEMENTS: From d30f25ee9788d3363544ba9779cabf157dba3b98 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 20:28:45 +0100 Subject: [PATCH 049/224] Detect shib login page as login page And do not assume we are logged in... --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 8f78e7a..c3b2342 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -630,7 +630,8 @@ instance's greatest bottleneck. mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") if mainbar is not None: login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) - return not login_button + shib_login = soup.find(id="button_shib_login") + return not login_button and not shib_login # Personal Desktop if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): From 4ee919625da8d3d04cbb889e24d05b1c09436fe8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 20:47:35 +0100 Subject: [PATCH 050/224] Add rudimentary support for content pages --- PFERD/crawl/ilias/kit_ilias_html.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index cee0555..754af16 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -77,8 +77,11 @@ class IliasPage: log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() if self._is_personal_desktop(): - log.explain("Page is the personal desktop") + log.explain("Page is the personal desktop, searching for elements") return 
self._find_personal_desktop_entries() + if self._is_content_page(): + log.explain("Page is a content page, searching for elements") + return self._find_copa_entries() log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() @@ -126,6 +129,12 @@ class IliasPage: def _is_personal_desktop(self) -> bool: return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) + def _is_content_page(self) -> bool: + link = self._soup.find(id="current_perma_link") + if not link: + return False + return "target=copa_" in link.get("value") + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. The actual video stream url is nowhere @@ -185,6 +194,23 @@ class IliasPage: return items + def _find_copa_entries(self) -> List[IliasPageElement]: + items: List[IliasPageElement] = [] + links: List[Tag] = self._soup.findAll(class_="ilc_flist_a_FileListItemLink") + + for link in links: + url = self._abs_url_from_link(link) + name = _sanitize_path_name(link.getText().strip().replace("\t", "")) + + if "file_id" not in url: + _unexpected_html_warning() + log.warn_contd(f"Found unknown content page item {name!r} with url {url!r}") + continue + + items.append(IliasPageElement(IliasElementType.FILE, url, name)) + + return items + def _find_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. The initial dummy page without any videos. 
This page contains the link to the listing From 4bf0c972e6e37afc7f9688104082189f5f78d390 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 11:47:59 +0100 Subject: [PATCH 051/224] Update types for rich 11 --- PFERD/logging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/logging.py b/PFERD/logging.py index 32e5268..e2d64fc 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -5,7 +5,7 @@ from contextlib import asynccontextmanager, contextmanager # TODO In Python 3.9 and above, ContextManager is deprecated from typing import AsyncIterator, ContextManager, Iterator, List, Optional -from rich.console import Console, RenderGroup +from rich.console import Console, Group from rich.live import Live from rich.markup import escape from rich.panel import Panel @@ -68,7 +68,7 @@ class Log: if self._download_progress.task_ids: elements.append(self._download_progress) - group = RenderGroup(*elements) # type: ignore + group = Group(*elements) # type: ignore self._live.update(group) @contextmanager From e9d2d0503001728f6c1f313982d8843d83405e3d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 11:39:42 +0100 Subject: [PATCH 052/224] Update changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index faa2507..1b392c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,10 +26,16 @@ ambiguous situations. 
- A KIT IPD crawler - Support for ILIAS cards - Support for multi-stream videos +- Support for ILIAS 7 ### Removed - [Interpolation](https://docs.python.org/3/library/configparser.html#interpolation-of-values) in config file +### Fixed +- Crawling of recursive courses +- Crawling files directly placed on the personal desktop +- Ignore timestamps at the unix epoch as they crash on windows + ## 3.2.0 - 2021-08-04 ### Added From e467b38d739347d62cbb122d9f4752abe823b423 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 18:23:00 +0100 Subject: [PATCH 053/224] Only reject 1970 timestamps on windows --- PFERD/output_dir.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index e612267..441717b 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -231,7 +231,9 @@ class OutputDirectory: stat = local_path.stat() remote_newer = None - if heuristics.mtime and heuristics.mtime.year > 1970: + + # Python on Windows crashes when faced with timestamps around the unix epoch + if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): mtime = heuristics.mtime remote_newer = mtime.timestamp() > stat.st_mtime if remote_newer: From 33453ede2d63b15bcca2ce541af2299440bfa8ff Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 18:31:42 +0100 Subject: [PATCH 054/224] Update dependency versions in setup.py --- setup.cfg | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.cfg b/setup.cfg index 5758282..059798a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,11 +6,11 @@ version = attr: PFERD.version.VERSION packages = find: python_requires = >=3.8 install_requires = - aiohttp>=3.7.4.post0 - beautifulsoup4>=4.9.3 - rich>=10.1.0 - keyring>=23.0.1 - certifi>=2020.12.5 + aiohttp>=3.8.1 + beautifulsoup4>=4.10.0 + rich>=11.0.0 + keyring>=23.5.0 + certifi>=2021.10.8 [options.entry_points] console_scripts = From 9618aae83bf10b8e517c53a53c47d14dd707c707 Mon 
Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 9 Jan 2022 18:32:58 +0100 Subject: [PATCH 055/224] Add content pages to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b392c1..6e4c7e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Added - A KIT IPD crawler - Support for ILIAS cards +- (Rudimentary) support for content pages - Support for multi-stream videos - Support for ILIAS 7 From 0045124a4e2851d4d1d84bc7c2b68c75f49d5375 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 9 Jan 2022 21:09:09 +0100 Subject: [PATCH 056/224] Bump version to 3.3.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e4c7e9..132351b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.3.0 - 2022-01-09 + ### Added - A KIT IPD crawler - Support for ILIAS cards diff --git a/PFERD/version.py b/PFERD/version.py index b8efadd..ca58f3a 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.2.0" +VERSION = "3.3.0" From 57ec51e95a238960d1832ba0ad85b2ff6ec1de3b Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 14 Jan 2022 20:15:19 +0100 Subject: [PATCH 057/224] Fix login after shib url parser change --- CHANGELOG.md | 4 +++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 39 +++++++++++++++++++--- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 132351b..41ee3d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,10 @@ ambiguous situations. ## Unreleased +### Fixed +- Shibboleth login fixed. It was broken due to URL parser changes and really + *unfortunate* behaviour by aiohttp. 
+ ## 3.3.0 - 2022-01-09 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c3b2342..c26ce8b 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -4,6 +4,7 @@ from pathlib import PurePath from typing import Any, Awaitable, Callable, Dict, List, Optional, Set, TypeVar, Union, cast import aiohttp +import yarl from aiohttp import hdrs from bs4 import BeautifulSoup, Tag @@ -674,14 +675,14 @@ class KitShibbolethLogin: # Equivalent: Click on "Mit KIT-Account anmelden" button in # https://ilias.studium.kit.edu/login.php - url = "https://ilias.studium.kit.edu/Shibboleth.sso/Login" + url = "https://ilias.studium.kit.edu/shib_login.php" data = { "sendLogin": "1", "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", - "target": "/shib_login.php", - "home_organization_selection": "Mit KIT-Account anmelden", + "il_target": "", + "home_organization_selection": "Weiter", } - soup: BeautifulSoup = await _post(sess, url, data) + soup: BeautifulSoup = await _shib_post(sess, url, data) # Attempt to login using credentials, if necessary while not self._login_successful(soup): @@ -761,3 +762,33 @@ class KitShibbolethLogin: async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: async with session.post(url, data=data) as response: return soupify(await response.read()) + + +async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: + """ + aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected + by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and + build encoded URL objects ourselfs... Who thought mangling location header was a good idea?? 
+ """ + async with session.post(url, data=data, allow_redirects=False) as response: + location = response.headers.get("location") + if not location: + raise CrawlWarning(f"Login failed, no location header present at {url}") + correct_url = yarl.URL(location, encoded=True) + + async with session.get(correct_url, allow_redirects=False) as response: + as_yarl = yarl.URL(response.url) + location = response.headers.get("location") + + if not location or not as_yarl.host: + raise CrawlWarning(f"Login failed, no location header present at {correct_url}") + + correct_url = yarl.URL.build( + scheme=as_yarl.scheme, + host=as_yarl.host, + path=location, + encoded=True + ) + + async with session.get(correct_url, allow_redirects=False) as response: + return soupify(await response.read()) From f47e7374d23b71396b511ee7b57f59d46c34e00d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 14 Jan 2022 22:01:45 +0100 Subject: [PATCH 058/224] Use fixed windows path for video cache --- CHANGELOG.md | 4 +++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 9 +++++++-- PFERD/deduplicator.py | 6 ++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 41ee3d5..7f35a90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,8 +23,10 @@ ambiguous situations. ## Unreleased ### Fixed -- Shibboleth login fixed. It was broken due to URL parser changes and really +- Shibboleth login. It was broken due to URL parser changes and really *unfortunate* behaviour by aiohttp. +- local video cache on windows if the path was changed to accomodate windows + file system limitations (e.g. replace `:`) ## 3.3.0 - 2022-01-09 diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c26ce8b..b197b6b 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -499,7 +499,7 @@ instance's greatest bottleneck. 
log.explain_topic(f"Checking local cache for video {video_path.name}") all_found_locally = True for video in contained_videos: - transformed_path = self._transformer.transform(video) + transformed_path = self._to_local_video_path(video) if transformed_path: exists_locally = self._output_dir.resolve(transformed_path).exists() all_found_locally = all_found_locally and exists_locally @@ -509,6 +509,11 @@ instance's greatest bottleneck. log.explain("Missing at least one video, continuing with requests!") return False + def _to_local_video_path(self, path: PurePath) -> Optional[PurePath]: + if transformed := self._transformer.transform(path): + return self._deduplicator.fixup_path(transformed) + return None + @anoncritical @_iorepeat(3, "downloading video") async def _download_video( @@ -528,7 +533,7 @@ instance's greatest bottleneck. log.explain(f"Using single video mode for {element.name}") stream_element = stream_elements[0] - transformed_path = self._transformer.transform(original_path) + transformed_path = self._to_local_video_path(original_path) if not transformed_path: raise CrawlError(f"Download returned a path but transform did not for {original_path}") diff --git a/PFERD/deduplicator.py b/PFERD/deduplicator.py index ef62dcb..7777f28 100644 --- a/PFERD/deduplicator.py +++ b/PFERD/deduplicator.py @@ -56,6 +56,12 @@ class Deduplicator: log.explain(f"Changed path to {fmt_path(new_path)} for windows compatibility") return new_path + def fixup_path(self, path: PurePath) -> PurePath: + """Fixes up the path for windows, if enabled. 
Returns the path unchanged otherwise.""" + if self._windows_paths: + return self._fixup_for_windows(path) + return path + def mark(self, path: PurePath) -> PurePath: if self._windows_paths: path = self._fixup_for_windows(path) From 4f022e2d192552ddef22b169044f2692bc4e1563 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 Jan 2022 15:06:02 +0100 Subject: [PATCH 059/224] Reword changelog --- CHANGELOG.md | 6 ++---- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f35a90..76cf836 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,10 +23,8 @@ ambiguous situations. ## Unreleased ### Fixed -- Shibboleth login. It was broken due to URL parser changes and really - *unfortunate* behaviour by aiohttp. -- local video cache on windows if the path was changed to accomodate windows - file system limitations (e.g. replace `:`) +- ILIAS login +- Local video cache if `windows_paths` is enabled ## 3.3.0 - 2022-01-09 diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index b197b6b..a3e37a9 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -773,7 +773,7 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea """ aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and - build encoded URL objects ourselfs... Who thought mangling location header was a good idea?? + build encoded URL objects ourselves... Who thought mangling location header was a good idea?? 
""" async with session.post(url, data=data, allow_redirects=False) as response: location = response.headers.get("location") From 86947e4874f0853444e38de0fac4d2ddab5ae41e Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 Jan 2022 15:11:22 +0100 Subject: [PATCH 060/224] Bump version to 3.3.1 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76cf836..d5f9dc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.3.1 - 2022-01-15 + ### Fixed - ILIAS login - Local video cache if `windows_paths` is enabled diff --git a/PFERD/version.py b/PFERD/version.py index ca58f3a..37e91f3 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.3.0" +VERSION = "3.3.1" From 7872fe5221c4c8b95b59ffe54f879c1c39e736f3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 18 Jan 2022 22:32:43 +0100 Subject: [PATCH 061/224] Fix tables with more columns than expected --- PFERD/crawl/ilias/kit_ilias_html.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 754af16..94b2e4b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -280,11 +280,22 @@ class IliasPage: def _listed_video_to_element(self, link: Tag) -> IliasPageElement: # The link is part of a table with multiple columns, describing metadata. - # 6th child (1 indexed) is the modification time string - modification_string = link.parent.parent.parent.select_one( - "td.std:nth-child(6)" - ).getText().strip() - modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") + # 6th or 7th child (1 indexed) is the modification time string. 
Try to find it + # by parsing backwards from the end and finding something that looks like a date + modification_time = None + row: Tag = link.parent.parent.parent + column_count = len(row.select("td.std")) + for index in range(column_count, 0, -1): + modification_string = link.parent.parent.parent.select_one( + f"td.std:nth-child({index})" + ).getText().strip() + if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): + modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") + break + + if modification_time is None: + log.warn(f"Could not determine upload time for {link}") + modification_time = datetime.now() title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() title += ".mp4" From 86e2e226dcefb98232410cc2289d11a664076adc Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 3 Apr 2022 11:32:38 +0200 Subject: [PATCH 062/224] Notify user when shibboleth presents new entitlements --- CHANGELOG.md | 2 ++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5f9dc6..4e11224 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ ambiguous situations. ### Fixed - ILIAS login - Local video cache if `windows_paths` is enabled +- Report when Shibboleth reviews entitlements +- Support for video listings with more columns ## 3.3.0 - 2022-01-09 diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index a3e37a9..2a5fc87 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -710,6 +710,12 @@ class KitShibbolethLogin: } soup = await _post(sess, url, data) + if soup.find(id="attributeRelease"): + raise CrawlError( + "ILIAS Shibboleth entitlements changed! 
" + "Please log in once in your browser and review them" + ) + if self._tfa_required(soup): soup = await self._authenticate_tfa(sess, soup) From da72863b471c048768a0d8234ba02298b1f9e4c1 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 3 Apr 2022 13:19:08 +0200 Subject: [PATCH 063/224] Placate newer mypy --- PFERD/logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/logging.py b/PFERD/logging.py index e2d64fc..e833716 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -68,7 +68,7 @@ class Log: if self._download_progress.task_ids: elements.append(self._download_progress) - group = Group(*elements) # type: ignore + group = Group(*elements) self._live.update(group) @contextmanager From a2831fbea2e8758686677c44645fdd6f3cbc40fa Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 13:55:24 +0200 Subject: [PATCH 064/224] Fix shib authentication Authentication failed previously if the shib session was still valid. If Shibboleth gets a request and the session is still valid, it directly responds without a second redirect. 
--- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 2a5fc87..571e4d7 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -784,15 +784,19 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea async with session.post(url, data=data, allow_redirects=False) as response: location = response.headers.get("location") if not location: - raise CrawlWarning(f"Login failed, no location header present at {url}") + raise CrawlWarning(f"Login failed (1), no location header present at {url}") correct_url = yarl.URL(location, encoded=True) async with session.get(correct_url, allow_redirects=False) as response: - as_yarl = yarl.URL(response.url) location = response.headers.get("location") + # If shib still still has a valid session, it will directly respond to the request + if location is None: + return soupify(await response.read()) + as_yarl = yarl.URL(response.url) + # Probably not needed anymore, but might catch a few weird situations with a nicer message if not location or not as_yarl.host: - raise CrawlWarning(f"Login failed, no location header present at {correct_url}") + raise CrawlWarning(f"Login failed (2), no location header present at {correct_url}") correct_url = yarl.URL.build( scheme=as_yarl.scheme, From f17b9b68f4cdc397b029361260d35aad7e778308 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 14:01:40 +0200 Subject: [PATCH 065/224] Add shibboleth authentication fix to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e11224..b3da789 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ ambiguous situations. 
- Local video cache if `windows_paths` is enabled - Report when Shibboleth reviews entitlements - Support for video listings with more columns +- Authentication when the shib session is still valid ## 3.3.0 - 2022-01-09 From 07a21f80a63dfd4f47dae4dadc8e515334a9891d Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 21:15:33 +0200 Subject: [PATCH 066/224] Link to unofficial packages --- CHANGELOG.md | 3 +++ README.md | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b3da789..c64b69a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- Links to unofficial packages and repology in the readme + ## 3.3.1 - 2022-01-15 ### Fixed diff --git a/README.md b/README.md index 836147f..b8b2551 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,14 @@ $ pip install --upgrade git+https://github.com/Garmelon/PFERD@latest The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. +### With package managers + +Unofficial packages are available for: +- [AUR](https://aur.archlinux.org/packages/pferd) +- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) + +See also PFERD's [repology page](https://repology.org/project/pferd/versions). + ## Basic usage PFERD can be run directly from the command line with no config file. Run `pferd From ba3d299c05bae299a3da5c378e9c5f311e78f62f Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 21:23:55 +0200 Subject: [PATCH 067/224] Fix changelog --- CHANGELOG.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c64b69a..c5480f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,16 +23,18 @@ ambiguous situations. 
## Unreleased ### Added +- Message when Shibboleth entitlements need to be manually reviewed +- Support for video listings with more columns - Links to unofficial packages and repology in the readme +### Fixed +- Crash during authentication when the Shibboleth session is still valid + ## 3.3.1 - 2022-01-15 ### Fixed - ILIAS login - Local video cache if `windows_paths` is enabled -- Report when Shibboleth reviews entitlements -- Support for video listings with more columns -- Authentication when the shib session is still valid ## 3.3.0 - 2022-01-09 From a99ddaa0cc28e04edfc95d541f0b1f6ca885965c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 21:47:51 +0200 Subject: [PATCH 068/224] Read and write config in UTF-8 --- PFERD/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PFERD/config.py b/PFERD/config.py index 0ea7abc..5635573 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -120,7 +120,7 @@ class Config: # Using config.read_file instead of config.read because config.read # would just ignore a missing file and carry on. 
try: - with open(path) as f: + with open(path, encoding="utf-8") as f: parser.read_file(f, source=str(path)) except FileNotFoundError: raise ConfigLoadError(path, "File does not exist") @@ -154,12 +154,12 @@ class Config: try: # x = open for exclusive creation, failing if the file already # exists - with open(path, "x") as f: + with open(path, "x", encoding="utf-8") as f: self._parser.write(f) except FileExistsError: print("That file already exists.") if asyncio.run(prompt_yes_no("Overwrite it?", default=False)): - with open(path, "w") as f: + with open(path, "w", encoding="utf-8") as f: self._parser.write(f) else: raise ConfigDumpError(path, "File already exists") From a709280cbf0bf5dbb62507f9829647862ef5f6bc Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 21:48:09 +0200 Subject: [PATCH 069/224] Try to detect unsupported config file encoding The encoding detection is quite rudimentary, but should detect the default windows encoding in many cases. --- PFERD/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PFERD/config.py b/PFERD/config.py index 5635573..8f7e682 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -128,6 +128,8 @@ class Config: raise ConfigLoadError(path, "That's a directory, not a file") except PermissionError: raise ConfigLoadError(path, "Insufficient permissions") + except UnicodeDecodeError: + raise ConfigLoadError(path, "File is not encoded using UTF-8") def dump(self, path: Optional[Path] = None) -> None: """ From 00db34821825a719712f6bc25420bdfaed9bda11 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 27 Apr 2022 21:53:29 +0200 Subject: [PATCH 070/224] Update changelog --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5480f2..e70d328 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,9 +24,12 @@ ambiguous situations. 
### Added - Message when Shibboleth entitlements need to be manually reviewed -- Support for video listings with more columns - Links to unofficial packages and repology in the readme +### Changed +- Support video listings with more columns +- Use UTF-8 when reading/writing the config file + ### Fixed - Crash during authentication when the Shibboleth session is still valid From 31631fb409d80f7c0cf8dd964da993ef08aa6fe5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 22:16:47 +0200 Subject: [PATCH 071/224] Increase minimum python version to 3.9 --- .github/workflows/build-and-release.yml | 2 +- CHANGELOG.md | 1 + README.md | 2 +- setup.cfg | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 565c4e3..090ac7e 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python: ["3.8"] + python: ["3.9"] steps: - uses: actions/checkout@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index e70d328..7cee430 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ ambiguous situations. - Links to unofficial packages and repology in the readme ### Changed +- Increase minimum supported Python version to 3.9 - Support video listings with more columns - Use UTF-8 when reading/writing the config file diff --git a/README.md b/README.md index b8b2551..ce917b0 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Binaries for Linux, Windows and Mac can be downloaded directly from the ### With pip -Ensure you have at least Python 3.8 installed. Run the following command to +Ensure you have at least Python 3.9 installed. 
Run the following command to install PFERD or upgrade it to the latest version: ``` diff --git a/setup.cfg b/setup.cfg index 059798a..2378c48 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ version = attr: PFERD.version.VERSION [options] packages = find: -python_requires = >=3.8 +python_requires = >=3.9 install_requires = aiohttp>=3.8.1 beautifulsoup4>=4.10.0 From 602044ff1b0b49348a50248f7f93334df979044a Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 22:50:06 +0200 Subject: [PATCH 072/224] Fix mypy errors and add missing await --- PFERD/crawl/crawler.py | 5 +++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 21 ++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 53f43e9..0e67c02 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -1,9 +1,10 @@ import asyncio import os from abc import ABC, abstractmethod +from collections.abc import Awaitable, Coroutine from datetime import datetime from pathlib import Path, PurePath -from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypeVar from ..auth import Authenticator from ..config import Config, Section @@ -58,7 +59,7 @@ def noncritical(f: Wrapped) -> Wrapped: return wrapper # type: ignore -AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) +AWrapped = TypeVar("AWrapped", bound=Callable[..., Coroutine[Any, Any, Optional[Any]]]) def anoncritical(f: AWrapped) -> AWrapped: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 571e4d7..ae9ebd4 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,7 +1,8 @@ import asyncio import re +from collections.abc import Awaitable, Coroutine from pathlib import PurePath -from typing import Any, Awaitable, 
Callable, Dict, List, Optional, Set, TypeVar, Union, cast +from typing import Any, Callable, Dict, List, Optional, Set, Union, cast import aiohttp import yarl @@ -13,7 +14,7 @@ from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param -from ..crawler import CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical +from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement @@ -82,8 +83,6 @@ _VIDEO_ELEMENTS: Set[IliasElementType] = set([ IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, ]) -AWrapped = TypeVar("AWrapped", bound=Callable[..., Awaitable[Optional[Any]]]) - def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: def decorator(f: AWrapped) -> AWrapped: @@ -252,7 +251,7 @@ instance's greatest bottleneck. url: str, parent: IliasPageElement, path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: maybe_cl = await self.crawl(path) if not maybe_cl: return None @@ -310,7 +309,7 @@ instance's greatest bottleneck. self, parent_path: PurePath, element: IliasPageElement, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: if element.url in self._visited_urls: raise CrawlWarning( f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" @@ -360,7 +359,7 @@ instance's greatest bottleneck. self, element: IliasPageElement, element_path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain(f"Links type is {self._links}") @@ -407,7 +406,7 @@ instance's greatest bottleneck. 
self, element: IliasPageElement, element_path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: log.explain_topic(f"Decision: Crawl Booking Link {fmt_path(element_path)}") log.explain(f"Links type is {self._links}") @@ -443,7 +442,7 @@ instance's greatest bottleneck. if hdrs.LOCATION not in resp.headers: return soupify(await resp.read()).select_one("a").get("href").strip() - self._authenticate() + await self._authenticate() async with self.session.get(export_url, allow_redirects=False) as resp: # No redirect means we were authenticated @@ -456,7 +455,7 @@ instance's greatest bottleneck. self, element: IliasPageElement, element_path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: # Copy old mapping as it is likely still relevant if self.prev_report: self.report.add_custom_value( @@ -564,7 +563,7 @@ instance's greatest bottleneck. self, element: IliasPageElement, element_path: PurePath, - ) -> Optional[Awaitable[None]]: + ) -> Optional[Coroutine[Any, Any, None]]: maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: return None From d2e6d918806310a3bcda7a82c74853b7f59eb99f Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 27 Apr 2022 22:50:36 +0200 Subject: [PATCH 073/224] Make PFERD executable via python -m --- PFERD/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PFERD/__main__.py b/PFERD/__main__.py index bdf5b34..4faeb13 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -159,3 +159,7 @@ def main() -> None: sys.exit(1) else: pferd.print_report() + + +if __name__ == "__main__": + main() From aa74604d293ec25ae7f94431d4431313dabfc26c Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 29 Apr 2022 23:11:27 +0200 Subject: [PATCH 074/224] Use utf-8 for report --- PFERD/output_dir.py | 2 +- PFERD/report.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 441717b..c92f4a6 
100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -503,7 +503,7 @@ class OutputDirectory: try: self._prev_report = Report.load(self._report_path) log.explain("Loaded report successfully") - except (OSError, json.JSONDecodeError, ReportLoadError) as e: + except (OSError, UnicodeDecodeError, json.JSONDecodeError, ReportLoadError) as e: log.explain("Failed to load report") log.explain(str(e)) diff --git a/PFERD/report.py b/PFERD/report.py index 0e0c789..0eaaca9 100644 --- a/PFERD/report.py +++ b/PFERD/report.py @@ -100,10 +100,10 @@ class Report: @classmethod def load(cls, path: Path) -> "Report": """ - May raise OSError, JsonDecodeError, ReportLoadError. + May raise OSError, UnicodeDecodeError, JsonDecodeError, ReportLoadError. """ - with open(path) as f: + with open(path, encoding="utf-8") as f: data = json.load(f) if not isinstance(data, dict): @@ -148,7 +148,7 @@ class Report: "encountered_errors": self.encountered_errors, } - with open(path, "w") as f: + with open(path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, sort_keys=True) f.write("\n") # json.dump doesn't do this From b56475450de9a00a0ab12bfdf9adf9b5b229f38e Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 29 Apr 2022 23:12:41 +0200 Subject: [PATCH 075/224] Use utf-8 for cookies --- PFERD/crawl/http_crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index fa4cf29..44ec4dd 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -108,7 +108,7 @@ class HttpCrawler(Crawler): def _load_cookies_from_file(self, path: Path) -> None: jar: Any = http.cookies.SimpleCookie() - with open(path) as f: + with open(path, encoding="utf-8") as f: for i, line in enumerate(f): # Names of headers are case insensitive if line[:11].lower() == "set-cookie:": @@ -121,7 +121,7 @@ class HttpCrawler(Crawler): jar: Any = http.cookies.SimpleCookie() for morsel in self._cookie_jar: jar[morsel.key] = 
morsel - with open(path, "w") as f: + with open(path, "w", encoding="utf-8") as f: f.write(jar.output(sep="\n")) f.write("\n") # A trailing newline is just common courtesy From a8f76e9be76f4bb0ee24030ea252354ede1c8ce4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 29 Apr 2022 23:15:12 +0200 Subject: [PATCH 076/224] Use utf-8 for credential file --- PFERD/auth/credential_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PFERD/auth/credential_file.py b/PFERD/auth/credential_file.py index d0fcdda..94ffa73 100644 --- a/PFERD/auth/credential_file.py +++ b/PFERD/auth/credential_file.py @@ -20,8 +20,10 @@ class CredentialFileAuthenticator(Authenticator): path = config.default_section.working_dir() / section.path() try: - with open(path) as f: + with open(path, encoding="utf-8") as f: lines = list(f) + except UnicodeDecodeError: + raise AuthLoadError(f"Credential file at {fmt_real_path(path)} is not encoded using UTF-8") except OSError as e: raise AuthLoadError(f"No credential file at {fmt_real_path(path)}") from e From a241672726529d1a0ed852b1db2df7968ee6f137 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 1 May 2022 22:29:06 +0200 Subject: [PATCH 077/224] Bump version to 3.4.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cee430..310059a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.4.0 - 2022-05-01 + ### Added - Message when Shibboleth entitlements need to be manually reviewed - Links to unofficial packages and repology in the readme diff --git a/PFERD/version.py b/PFERD/version.py index 37e91f3..8102d37 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.3.1" +VERSION = "3.4.0" From b8fe25c580a8cafc14c32890f0635c7daecafc4d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 May 2022 14:13:39 +0200 Subject: [PATCH 078/224] Add `.cpp` to ipd link regex --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 310059a..22fdd29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Changed +- Add `.cpp` to IPD link regex + ## 3.4.0 - 2022-05-01 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 1a5314b..e5ec58f 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -27,7 +27,7 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target def link_regex(self) -> Pattern[str]: - regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|java)$") + regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$") return re.compile(regex) From afbd03f7774a1c0f22c471d98f995153bb08edcd Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:15:48 +0200 Subject: [PATCH 079/224] Fix docs --- CHANGELOG.md | 2 +- CONFIG.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22fdd29..f5af29d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ ambiguous situations. 
## Unreleased ### Changed -- Add `.cpp` to IPD link regex +- Add `cpp` extension to default `link_regex` of IPD crawler ## 3.4.0 - 2022-05-01 diff --git a/CONFIG.md b/CONFIG.md index 569780d..1355c34 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -146,7 +146,7 @@ requests is likely a good idea. - `target`: URL to a KIT-IPD page - `link_regex`: A regex that is matched against the `href` part of links. If it matches, the given link is downloaded as a file. This is used to extract - files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|java)$`) + files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$`) ### The `kit-ilias-web` crawler From bc3fa36637b5a4f4ea26db1a9437e4cbd5cad5c4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:20:45 +0200 Subject: [PATCH 080/224] Fix IPD crawler crashing on weird HTML comments --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5af29d..de7b795 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ ambiguous situations. ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler +### Fixed +- IPD crawler crashes on some sites + ## 3.4.0 - 2022-05-01 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index e5ec58f..58e71f8 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -161,4 +161,10 @@ class KitIpdCrawler(HttpCrawler): async def get_page(self) -> BeautifulSoup: async with self.session.get(self._url) as request: - return soupify(await request.read()) + # The web page for Algorithmen für Routenplanung contains some + # weird comments that beautifulsoup doesn't parse correctly. This + # hack enables those pages to be crawled, and should hopefully not + # cause issues on other pages. 
+ content = (await request.read()).decode("utf-8") + content = re.sub(r"<!--.*?-->", "", content) + return soupify(content.encode("utf-8")) From af2cc1169ace7154349518f7f709023eeb76ba95 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:23:19 +0200 Subject: [PATCH 081/224] Mention href for users of link_regex option --- CHANGELOG.md | 1 + PFERD/crawl/kit_ipd_crawler.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de7b795..959fda0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler +- Mention hrefs in IPD crawler for users of `link_regex` option ### Fixed - IPD crawler crashes on some sites diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 58e71f8..78fe0b1 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -45,7 +45,7 @@ class KitIpdFolder: def explain(self) -> None: log.explain_topic(f"Folder {self.name!r}") for file in self.files: - log.explain(f"File {file.name!r}") + log.explain(f"File {file.name!r} (href={file.url!r})") def __hash__(self) -> int: return self.name.__hash__() @@ -113,7 +113,7 @@ class KitIpdCrawler(HttpCrawler): else: file = self._extract_file(element) items.add(file) - log.explain_topic(f"Orphan file {file.name!r}") + log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") log.explain("Attributing it to root folder") return items From 694ffb4d7711265d768a636cf1843e302485c62d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:28:30 +0200 Subject: [PATCH 082/224] Fix meeting date parsing Apparently the new pattern "<date>: <time>," was added. This patch adds support for it.
--- PFERD/crawl/ilias/kit_ilias_html.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 94b2e4b..dfe111d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -763,9 +763,14 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti """ try: date_str = re.sub(r"\s+", " ", date_str) + date_str = re.sub("(Gestern|Yesterday):", "", date_str, re.I) + date_str = re.sub("(Heute|Today):", "", date_str, re.I) + date_str = re.sub("(Morgen|Tomorrow):", "", date_str, re.I) + date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) + date_str = date_str.strip() for german, english in zip(german_months, english_months): date_str = date_str.replace(german, english) # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" From bcc537468c46088f78a037fb28364866e8653bb5 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:53:37 +0200 Subject: [PATCH 083/224] Fix crawling of expanded meetings The last meeting on every page is expanded by default. Its content is then shown inline *and* in the meeting page itself. We should skip the inline content. --- PFERD/crawl/ilias/kit_ilias_html.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index dfe111d..d93684c 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -428,6 +428,12 @@ class IliasPage: element_type = self._find_type_from_link(element_name, link, abs_url) description = self._find_link_description(link) + # The last meeting on every page is expanded by default. 
+ # Its content is then shown inline *and* in the meeting page itself. + # We should skip the inline content. + if element_type != IliasElementType.MEETING and self._is_in_expanded_meeting(link): + continue + if not element_type: continue if element_type == IliasElementType.MEETING: @@ -445,6 +451,26 @@ class IliasPage: return result + def _is_in_expanded_meeting(self, tag: Tag) -> bool: + """ + Returns whether a file is part of an expanded meeting. + Has false positives for meetings themselves as their title is also "in the expanded meeting content". + It is in the same general div and this whole thing is guesswork. + Therefore, you should check for meetings before passing them in this function. + """ + parents: List[Tag] = list(tag.parents) + for parent in parents: + if not parent.get("class"): + continue + + # We should not crawl files under meetings + if "ilContainerListItemContentCB" in parent.get("class"): + link: Tag = parent.parent.find("a") + type = IliasPage._find_type_from_folder_like(link, self._page_url) + return type == IliasElementType.MEETING + + return False + def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: """ Interprets accordions and expandable blocks as virtual folders and returns them From 2f0e04ce13ebbc7c7ccaa93e03d8f707f246ceef Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:57:55 +0200 Subject: [PATCH 084/224] Adjust changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 959fda0..4249287 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ ambiguous situations. 
### Fixed - IPD crawler crashes on some sites +- Meeting name normalization for yesterday, today and tomorrow fails +- Crawling of meeting file previews ## 3.4.0 - 2022-05-01 From 616b0480f7c92afe11c36d2c105c99ba5f960e96 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 8 May 2022 17:39:18 +0200 Subject: [PATCH 085/224] Simplify IPD crawler link regex --- CHANGELOG.md | 5 +++-- CONFIG.md | 2 +- PFERD/crawl/kit_ipd_crawler.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4249287..e2d3840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,11 +24,12 @@ ambiguous situations. ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler -- Mention hrefs in IPD crawler for users of `link_regex` option +- Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option +- Simplify default IPD crawler `link_regex` ### Fixed - IPD crawler crashes on some sites -- Meeting name normalization for yesterday, today and tomorrow fails +- Meeting name normalization for yesterday, today and tomorrow - Crawling of meeting file previews ## 3.4.0 - 2022-05-01 diff --git a/CONFIG.md b/CONFIG.md index 1355c34..f572a80 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -146,7 +146,7 @@ requests is likely a good idea. - `target`: URL to a KIT-IPD page - `link_regex`: A regex that is matched against the `href` part of links. If it matches, the given link is downloaded as a file. This is used to extract - files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$`) + files from KIT-IPD pages. 
(Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`) ### The `kit-ilias-web` crawler diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 78fe0b1..d9fac32 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -27,7 +27,7 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target def link_regex(self) -> Pattern[str]: - regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$") + regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$") return re.compile(regex) From a5015fe9b16d484613a27687f2c122b15e109ba2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 8 May 2022 23:21:18 +0200 Subject: [PATCH 086/224] Correctly parse day-only meeting dates I failed to recognize the correct format in the previous adjustment, so this (hopefully) fixes it for good. Meetings apparently don't always have a time portion. --- PFERD/crawl/ilias/kit_ilias_html.py | 48 +++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d93684c..6d063b6 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -746,17 +746,26 @@ class IliasPage: Normalizes meeting names, which have a relative time as their first part, to their date in ISO format. """ - date_portion_str = meeting_name.split(" - ")[0] + + # This checks whether we can reach a `:` without passing a `-` + if re.search(r"^[^-]+: ", meeting_name): + # Meeting name only contains date: "05. Jan 2000:" + split_delimiter = ":" + else: + # Meeting name contains date and start/end times: "05. 
Jan 2000, 16:00 - 17:30:" + split_delimiter = ", " + + # We have a meeting day without time + date_portion_str = meeting_name.split(split_delimiter)[0] date_portion = demangle_date(date_portion_str) + # We failed to parse the date, bail out if not date_portion: return meeting_name - rest_of_name = meeting_name - if rest_of_name.startswith(date_portion_str): - rest_of_name = rest_of_name[len(date_portion_str):] - - return datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name + # Replace the first section with the absolute date + rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) + return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name def _abs_url_from_link(self, link_tag: Tag) -> str: """ @@ -781,17 +790,15 @@ english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]: """ - Demangle a given date in one of the following formats: + Demangle a given date in one of the following formats (hour/minute part is optional): "Gestern, HH:MM" "Heute, HH:MM" "Morgen, HH:MM" "dd. mon yyyy, HH:MM """ try: + # Normalize whitespace because users date_str = re.sub(r"\s+", " ", date_str) - date_str = re.sub("(Gestern|Yesterday):", "", date_str, re.I) - date_str = re.sub("(Heute|Today):", "", date_str, re.I) - date_str = re.sub("(Morgen|Tomorrow):", "", date_str, re.I) date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) @@ -802,19 +809,28 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" date_str = date_str.replace(english + ".", english) - # We now have a nice english String in the format: "dd. 
mmm yyyy, hh:mm" - day_part, time_part = date_str.split(",") + # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" or "dd. mmm yyyy" + + # Check if we have a time as well + if ", " in date_str: + day_part, time_part = date_str.split(",") + else: + day_part = date_str.split(",")[0] + time_part = None + day_str, month_str, year_str = day_part.split(" ") day = int(day_str.strip().replace(".", "")) month = english_months.index(month_str.strip()) + 1 year = int(year_str.strip()) - hour_str, minute_str = time_part.split(":") - hour = int(hour_str) - minute = int(minute_str) + if time_part: + hour_str, minute_str = time_part.split(":") + hour = int(hour_str) + minute = int(minute_str) + return datetime(year, month, day, hour, minute) - return datetime(year, month, day, hour, minute) + return datetime(year, month, day) except Exception: if not fail_silently: log.warn(f"Date parsing failed for {date_str!r}") From 846c29aee1867f7f0b7efae802af47fee77a3ec6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 11 May 2022 21:16:09 +0200 Subject: [PATCH 087/224] Download page descriptions --- CHANGELOG.md | 3 + PFERD/crawl/ilias/ilias_html_cleaner.py | 91 ++++++++++++++++++++++ PFERD/crawl/ilias/kit_ilias_html.py | 25 ++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 29 +++++++ 4 files changed, 148 insertions(+) create mode 100644 PFERD/crawl/ilias/ilias_html_cleaner.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e2d3840..b7cad13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Added +- Download of page descriptions + ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py new file mode 100644 index 0000000..5952309 --- /dev/null +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -0,0 +1,91 @@ +from bs4 import BeautifulSoup, Comment, Tag + +_STYLE_TAG_CONTENT = """ + .ilc_text_block_Information { + background-color: #f5f7fa; + } + div.ilc_text_block_Standard { + margin-bottom: 10px; + margin-top: 10px; + } + span.ilc_text_inline_Strong { + font-weight: bold; + } + + .accordion-head { + background-color: #f5f7fa; + padding: 0.5rem 0; + } + + h3 { + margin-top: 0.5rem; + margin-bottom: 1rem; + } + + br.visible-break { + margin-bottom: 1rem; + } + + article { + margin: 0.5rem 0; + } + + body { + padding: 1em; + grid-template-columns: 1fr min(60rem, 90%) 1fr; + line-height: 1.2; + } +""" + +_ARTICLE_WORTHY_CLASSES = [ + "ilc_text_block_Information", + "ilc_section_Attention", + "ilc_section_Link", +] + + +def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: + head = soup.new_tag("head") + soup.insert(0, head) + + simplecss_link: Tag = soup.new_tag("link") + # + simplecss_link["rel"] = "stylesheet" + simplecss_link["href"] = "https://cdn.simplecss.org/simple.css" + head.append(simplecss_link) + + # Basic style tags for compat + style: Tag = soup.new_tag("style") + style.append(_STYLE_TAG_CONTENT) + head.append(style) + + return soup + + +def clean(soup: BeautifulSoup) -> BeautifulSoup: + for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES): + block.name = "article" + + for block in soup.find_all("h3"): + block.name = "div" + + for block in soup.find_all("h1"): + block.name = "h3" + + for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"): + block.name = "h3" + block["class"] += 
["accordion-head"] + + for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"): + children = list(dummy.children) + if not children: + dummy.decompose() + if len(children) > 1: + continue + if type(children[0]) == Comment: + dummy.decompose() + + for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): + hrule_imposter.insert(0, soup.new_tag("hr")) + + return soup diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 6d063b6..d58e5c8 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -85,6 +85,31 @@ class IliasPage: log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() + def get_description(self) -> Optional[BeautifulSoup]: + def is_interesting_class(name: str) -> bool: + return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] + + paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class) + if not paragraphs: + return None + + # Extract bits and pieces into a string and parse it again. + # This ensures we don't miss anything and weird structures are resolved + # somewhat gracefully. 
+ raw_html = "" + for p in paragraphs: + if p.find_parent(class_=is_interesting_class): + continue + + # Ignore special listings (like folder groupings) + if "ilc_section_Special" in p["class"]: + continue + + raw_html += str(p) + "\n" + raw_html = f"\n{raw_html}\n" + + return BeautifulSoup(raw_html, "html.parser") + def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_ilias_opencast_embedding(): return self.get_child_elements()[0] diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ae9ebd4..bbed986 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links +from .ilias_html_cleaner import clean, insert_base_markup from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement TargetType = Union[str, int] @@ -215,6 +216,8 @@ instance's greatest bottleneck. cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 elements: List[IliasPageElement] = [] + # A list as variable redefinitions are not propagated to outer scopes + description: List[BeautifulSoup] = [] @_iorepeat(3, "crawling url") async def gather_elements() -> None: @@ -233,9 +236,15 @@ instance's greatest bottleneck. page = IliasPage(soup, url, None) elements.extend(page.get_child_elements()) + if description_string := page.get_description(): + description.append(description_string) + # Fill up our task list with the found elements await gather_elements() + if description: + await self._download_description(PurePath("."), description[0]) + elements.sort(key=lambda e: e.id()) tasks: List[Awaitable[None]] = [] @@ -265,6 +274,8 @@ instance's greatest bottleneck. 
cl: CrawlToken, ) -> None: elements: List[IliasPageElement] = [] + # A list as variable redefinitions are not propagated to outer scopes + description: List[BeautifulSoup] = [] @_iorepeat(3, "crawling folder") async def gather_elements() -> None: @@ -285,10 +296,15 @@ instance's greatest bottleneck. next_stage_url = None elements.extend(page.get_child_elements()) + if description_string := page.get_description(): + description.append(description_string) # Fill up our task list with the found elements await gather_elements() + if description: + await self._download_description(PurePath("."), description[0]) + elements.sort(key=lambda e: e.id()) tasks: List[Awaitable[None]] = [] @@ -425,6 +441,19 @@ instance's greatest bottleneck. return self._download_booking(element, link_template_maybe, maybe_dl) + @anoncritical + @_iorepeat(1, "downloading description") + async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None: + path = parent_path / "Description.html" + dl = await self.download(path, redownload=Redownload.ALWAYS) + if not dl: + return + + async with dl as (bar, sink): + description = clean(insert_base_markup(description)) + sink.file.write(description.prettify().encode("utf-8")) + sink.done() + @anoncritical @_iorepeat(3, "resolving booking") async def _download_booking( From 46fb782798725b6fde76b71cf7a4d90912ea2c7d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 24 May 2022 23:28:09 +0200 Subject: [PATCH 088/224] Add forum crawling This downloads all forum posts when needed and saves each thread in its own html file, named after the thread title. 
--- CHANGELOG.md | 1 + PFERD/cli/command_kit_ilias_web.py | 7 ++ PFERD/crawl/ilias/kit_ilias_html.py | 90 ++++++++++++++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 122 ++++++++++++++++++--- PFERD/logging.py | 4 +- 5 files changed, 208 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7cad13..1d70c4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Added - Download of page descriptions +- Forum download support ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index 12803a6..de74fc3 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -62,6 +62,11 @@ GROUP.add_argument( action=BooleanOptionalAction, help="crawl and download videos" ) +GROUP.add_argument( + "--forums", + action=BooleanOptionalAction, + help="crawl and download forum posts" +) GROUP.add_argument( "--http-timeout", "-t", type=float, @@ -90,6 +95,8 @@ def load( section["link_redirect_delay"] = str(args.link_redirect_delay) if args.videos is not None: section["videos"] = "yes" if args.videos else "no" + if args.forums is not None: + section["forums"] = "yes" if args.forums else "no" if args.http_timeout is not None: section["http_timeout"] = str(args.http_timeout) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d58e5c8..7bab152 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -55,6 +55,20 @@ class IliasPageElement: return self.url +@dataclass +class IliasDownloadForumData: + url: str + form_data: Dict[str, 
Union[str, List[str]]] + + +@dataclass +class IliasForumThread: + title: str + title_tag: Tag + content_tag: Tag + mtime: Optional[datetime] + + class IliasPage: def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): @@ -110,13 +124,39 @@ class IliasPage: return BeautifulSoup(raw_html, "html.parser") + def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: + form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) + if not form: + return None + post_url = self._abs_url_from_relative(form["action"]) + + form_data: Dict[str, Union[str, List[str]]] = { + "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})], + "selected_cmd2": "html", + "select_cmd2": "Ausführen", + "selected_cmd": "", + } + + return IliasDownloadForumData(post_url, form_data) + def get_next_stage_element(self) -> Optional[IliasPageElement]: + if self._is_forum_page(): + if "trows=800" in self._page_url: + return None + return self._get_show_max_forum_entries_per_page_url() if self._is_ilias_opencast_embedding(): return self.get_child_elements()[0] if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: return self._find_video_entries_paginated()[0] return None + def _is_forum_page(self) -> bool: + read_more_btn = self._soup.find( + "button", + attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} + ) + return read_more_btn is not None + def _is_video_player(self) -> bool: return "paella_config_file" in str(self._soup) @@ -194,6 +234,19 @@ class IliasPage: return items + def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]: + correct_link = self._soup.find( + "a", + attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x} + ) + + if not correct_link: + return None + + link = self._abs_url_from_link(correct_link) + + return IliasPageElement(IliasElementType.FORUM, link, "show all forum 
threads") + def _find_personal_desktop_entries(self) -> List[IliasPageElement]: items: List[IliasPageElement] = [] @@ -877,3 +930,38 @@ def _tomorrow() -> date: def _sanitize_path_name(name: str) -> str: return name.replace("/", "-").replace("\\", "-").strip() + + +def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThread]: + elements = [] + for p in forum_export.select("body > p"): + title_tag = p + content_tag = p.find_next_sibling("ul") + title = p.find("b").text + if ":" in title: + title = title[title.find(":") + 1:] + title = title.strip() + mtime = _guess_timestamp_from_forum_post_content(content_tag) + elements.append(IliasForumThread(title, title_tag, content_tag, mtime)) + + return elements + + +def _guess_timestamp_from_forum_post_content(content: Tag) -> Optional[datetime]: + posts: Optional[Tag] = content.select(".ilFrmPostHeader > span.small") + if not posts: + return None + + newest_date: Optional[datetime] = None + + for post in posts: + text = post.text.strip() + text = text[text.rfind("|") + 1:] + date = demangle_date(text, fail_silently=True) + if not date: + continue + + if not newest_date or newest_date < date: + newest_date = date + + return newest_date diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index bbed986..156cd4c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -18,7 +18,8 @@ from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadTo from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, + _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -67,6 +68,9 @@ class 
KitIliasWebCrawlerSection(HttpCrawlerSection): def videos(self) -> bool: return self.s.getboolean("videos", fallback=False) + def forums(self) -> bool: + return self.s.getboolean("forums", fallback=False) + _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.EXERCISE, @@ -183,6 +187,7 @@ instance's greatest bottleneck. self._link_file_redirect_delay = section.link_redirect_delay() self._links = section.links() self._videos = section.videos() + self._forums = section.forums() self._visited_urls: Set[str] = set() async def _run(self) -> None: @@ -335,22 +340,27 @@ instance's greatest bottleneck. element_path = PurePath(parent_path, element.name) if element.type in _VIDEO_ELEMENTS: - log.explain_topic(f"Decision: Crawl video element {fmt_path(element_path)}") if not self._videos: - log.explain("Video crawling is disabled") - log.explain("Answer: no") + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](enable with option 'videos')" + ) return None - else: - log.explain("Video crawling is enabled") - log.explain("Answer: yes") if element.type == IliasElementType.FILE: return await self._handle_file(element, element_path) elif element.type == IliasElementType.FORUM: - log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") - log.explain("Forums are not supported") - log.explain("Answer: No") - return None + if not self._forums: + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](enable with option 'forums')" + ) + return None + return await self._handle_forum(element, element_path) elif element.type == IliasElementType.TEST: log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Tests contain no relevant files") @@ -635,6 +645,68 @@ instance's greatest bottleneck. 
if not await try_stream(): raise CrawlError("File streaming failed after authenticate()") + async def _handle_forum( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Coroutine[Any, Any, None]]: + maybe_cl = await self.crawl(element_path) + if not maybe_cl: + return None + return self._crawl_forum(element, maybe_cl) + + @_iorepeat(3, "crawling forum") + @anoncritical + async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: + elements = [] + + async with cl: + next_stage_url = element.url + while next_stage_url: + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {next_stage_url}") + + soup = await self._get_page(next_stage_url) + page = IliasPage(soup, next_stage_url, None) + + if next := page.get_next_stage_element(): + next_stage_url = next.url + else: + break + + download_data = page.get_download_forum_data() + if not download_data: + raise CrawlWarning("Failed to extract forum data") + html = await self._post_authenticated(download_data.url, download_data.form_data) + elements = parse_ilias_forum_export(soupify(html)) + + elements.sort(key=lambda elem: elem.title) + + tasks: List[Awaitable[None]] = [] + for elem in elements: + tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem))) + + # And execute them + await self.gather(tasks) + + @anoncritical + @_iorepeat(3, "saving forum thread") + async def _download_forum_thread( + self, + parent_path: PurePath, + element: IliasForumThread, + ) -> None: + path = parent_path / (_sanitize_path_name(element.title) + ".html") + maybe_dl = await self.download(path, mtime=element.mtime) + if not maybe_dl: + return + + async with maybe_dl as (bar, sink): + content = element.title_tag.prettify() + content += element.content_tag.prettify() + sink.file.write(content.encode("utf-8")) + sink.done() + async def _get_page(self, url: str) -> BeautifulSoup: auth_id = await self._current_auth_id() async with 
self.session.get(url) as request: @@ -652,13 +724,37 @@ instance's greatest bottleneck. return soup raise CrawlError("get_page failed even after authenticating") + async def _post_authenticated( + self, + url: str, + data: dict[str, Union[str, List[str]]] + ) -> BeautifulSoup: + auth_id = await self._current_auth_id() + + form_data = aiohttp.FormData() + for key, val in data.items(): + form_data.add_field(key, val) + + async with self.session.post(url, data=form_data, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) + + # Retry once after authenticating. If this fails, we will die. + async with self.session.post(url, data=data, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + raise CrawlError("post_authenticated failed even after authenticating") + # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. @_iorepeat(3, "Login", failure_is_error=True) async def _authenticate(self) -> None: await self._shibboleth_login.login(self.session) @staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: # Normal ILIAS pages mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") if mainbar is not None: login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) shib_login = soup.find(id="button_shib_login") return not login_button and not shib_login diff --git a/PFERD/logging.py b/PFERD/logging.py index e833716..340b21f 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -197,7 +197,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_explain: self.print(f" {escape(text)}") - def status(self, style: str, action: str, text: str) -> None: + def status(self, style: str, action: str, text: str, suffix: str = "") -> None: """ Print a status update while crawling. Allows markup in the "style" argument which will be applied to the "action" string. 
@@ -205,7 +205,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_status: action = escape(f"{action:<{self.STATUS_WIDTH}}") - self.print(f"{style}{action}[/] {escape(text)}") + self.print(f"{style}{action}[/] {escape(text)} {suffix}") def report(self, text: str) -> None: """ From ed24366aba7cfb8ca3cdd0df7b2650bc1220437f Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 Jan 2022 16:23:37 +0100 Subject: [PATCH 089/224] Add pass authenticator --- CHANGELOG.md | 1 + CONFIG.md | 21 ++++++++- PFERD/auth/__init__.py | 3 ++ PFERD/auth/pass_.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 PFERD/auth/pass_.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d70c4a..bc9f3e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Added - Download of page descriptions - Forum download support +- `pass` authenticator ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler diff --git a/CONFIG.md b/CONFIG.md index f572a80..0f114ed 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -223,6 +223,23 @@ is stored in the keyring. - `keyring_name`: The service name PFERD uses for storing credentials. (Default: `PFERD`) +### The `pass` authenticator + +This authenticator queries the [`pass` password manager][3] for a username and +password. It tries to be mostly compatible with [browserpass][4] and +[passff][5], so see those links for an overview of the format. If PFERD fails +to load your password, you can use the `--explain` flag to see why. 
+ +- `passname`: The name of the password to use (Required) +- `username_prefixes`: A comma-separated list of username line prefixes + (Default: `login,username,user`) +- `password_prefixes`: A comma-separated list of password line prefixes + (Default: `password,pass,secret`) + +[3]: <https://www.passwordstore.org/> "Pass: The Standard Unix Password Manager" +[4]: <https://github.com/browserpass/browserpass-extension#organizing-password-store> "Organizing password store" +[5]: <https://github.com/passff/passff#multi-line-format> "Multi-line format" + ### The `tfa` authenticator This authenticator prompts the user on the console for a two-factor @@ -316,7 +333,7 @@ is a regular expression and `TARGET` an f-string based template. If a path matches `SOURCE`, the output path is created using `TARGET` as template. `SOURCE` is automatically anchored. -`TARGET` uses Python's [format string syntax][3]. The *n*-th capturing group can +`TARGET` uses Python's [format string syntax][6]. The *n*-th capturing group can be referred to as `{g}` (e.g. `{g3}`). `{g0}` refers to the original path. If capturing group *n*'s contents are a valid integer, the integer value is available as `{i}` (e.g. `{i3}`). 
If capturing group *n*'s contents are a @@ -337,7 +354,7 @@ Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` - Converts `fooooo/bear` into `BOOOOOH/fear` - Converts `foo/bar/baz` into `BOOH/fear/baz` -[3]: <https://docs.python.org/3/library/string.html#format-string-syntax> "Format String Syntax" +[6]: <https://docs.python.org/3/library/string.html#format-string-syntax> "Format String Syntax" ### The `-name-re->` arrow diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 277cade..aa3ba8e 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -5,6 +5,7 @@ from ..config import Config from .authenticator import Authenticator, AuthError, AuthLoadError, AuthSection # noqa: F401 from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection from .keyring import KeyringAuthenticator, KeyringAuthSection +from .pass_ import PassAuthenticator, PassAuthSection from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator @@ -19,6 +20,8 @@ AUTHENTICATORS: Dict[str, AuthConstructor] = { CredentialFileAuthenticator(n, CredentialFileAuthSection(s), c), "keyring": lambda n, s, c: KeyringAuthenticator(n, KeyringAuthSection(s)), + "pass": lambda n, s, c: + PassAuthenticator(n, PassAuthSection(s)), "simple": lambda n, s, c: SimpleAuthenticator(n, SimpleAuthSection(s)), "tfa": lambda n, s, c: diff --git a/PFERD/auth/pass_.py b/PFERD/auth/pass_.py new file mode 100644 index 0000000..4c8e775 --- /dev/null +++ b/PFERD/auth/pass_.py @@ -0,0 +1,98 @@ +import re +import subprocess +from typing import List, Tuple + +from ..logging import log +from .authenticator import Authenticator, AuthError, AuthSection + + +class PassAuthSection(AuthSection): + def passname(self) -> str: + if (value := self.s.get("passname")) is None: + self.missing_value("passname") + return value + + def username_prefixes(self) -> List[str]: + value = self.s.get("username_prefixes", "login,username,user") + return [prefix.lower() for prefix in value.split(",")] + + def password_prefixes(self) -> List[str]: + value = self.s.get("password_prefixes", "password,pass,secret") + 
return [prefix.lower() for prefix in value.split(",")] + + +class PassAuthenticator(Authenticator): + PREFIXED_LINE_RE = r"([a-zA-Z]+):\s?(.*)" # to be used with fullmatch + + def __init__(self, name: str, section: PassAuthSection) -> None: + super().__init__(name) + + self._passname = section.passname() + self._username_prefixes = section.username_prefixes() + self._password_prefixes = section.password_prefixes() + + async def credentials(self) -> Tuple[str, str]: + log.explain_topic("Obtaining credentials from pass") + + try: + log.explain(f"Calling 'pass show {self._passname}'") + result = subprocess.check_output(["pass", "show", self._passname], text=True) + except subprocess.CalledProcessError as e: + raise AuthError(f"Failed to get password info from {self._passname}: {e}") + + prefixed = {} + unprefixed = [] + for line in result.strip().splitlines(): + if match := re.fullmatch(self.PREFIXED_LINE_RE, line): + prefix = match.group(1).lower() + value = match.group(2) + log.explain(f"Found prefixed line {line!r} with prefix {prefix!r}, value {value!r}") + if prefix in prefixed: + raise AuthError(f"Prefix {prefix} specified multiple times") + prefixed[prefix] = value + else: + log.explain(f"Found unprefixed line {line!r}") + unprefixed.append(line) + + username = None + for prefix in self._username_prefixes: + log.explain(f"Looking for username at prefix {prefix!r}") + if prefix in prefixed: + username = prefixed[prefix] + log.explain(f"Found username {username!r}") + break + + password = None + for prefix in self._password_prefixes: + log.explain(f"Looking for password at prefix {prefix!r}") + if prefix in prefixed: + password = prefixed[prefix] + log.explain(f"Found password {password!r}") + break + + if password is None and username is None: + log.explain("No username and password found so far") + log.explain("Using first unprefixed line as password") + log.explain("Using second unprefixed line as username") + elif password is None: + log.explain("No password 
found so far") + log.explain("Using first unprefixed line as password") + elif username is None: + log.explain("No username found so far") + log.explain("Using first unprefixed line as username") + + if password is None: + if not unprefixed: + log.explain("Not enough unprefixed lines left") + raise AuthError("Password could not be determined") + password = unprefixed.pop(0) + log.explain(f"Found password {password!r}") + + if username is None: + if not unprefixed: + log.explain("Not enough unprefixed lines left") + raise AuthError("Username could not be determined") + username = unprefixed.pop(0) + log.explain(f"Found username {username!r}") + + return username, password From 345f52a1f6f55eecf6c31d3cc1a4350c5200087d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:41:29 +0200 Subject: [PATCH 090/224] Detect new login button --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 156cd4c..c99a920 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -759,7 +759,7 @@ instance's greatest bottleneck. 
# Normal ILIAS pages mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") if mainbar is not None: - login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) + login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) shib_login = soup.find(id="button_shib_login") return not login_button and not shib_login From d9b111cec252f4b1810f06b0f2ca551cb5cdb2a2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:45:33 +0200 Subject: [PATCH 091/224] Correctly nest description entries --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c99a920..1852c5f 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -308,7 +308,7 @@ instance's greatest bottleneck. await gather_elements() if description: - await self._download_description(PurePath("."), description[0]) + await self._download_description(cl.path, description[0]) elements.sort(key=lambda e: e.id()) From aa5a3a10bcbfa0dd54a0dc1a533625f76b2d6ed8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:48:59 +0200 Subject: [PATCH 092/224] Adjust changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc9f3e5..7f35c9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,9 @@ ambiguous situations. 
- IPD crawler crashes on some sites - Meeting name normalization for yesterday, today and tomorrow - Crawling of meeting file previews +- Login with new login button html layout +- Descriptions for courses are now placed in the correct subfolder when + downloading the whole desktop ## 3.4.0 - 2022-05-01 From 66a5b1ba0223848f713192b084f2dcd26a18dbe5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 17 Aug 2022 13:24:01 +0200 Subject: [PATCH 093/224] Bump version to 3.4.1 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f35c9c..671d48a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.4.1 - 2022-08-17 + ### Added - Download of page descriptions - Forum download support diff --git a/PFERD/version.py b/PFERD/version.py index 8102d37..8832a51 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.0" +VERSION = "3.4.1" From 4a51aaa4f5a1b3382f0bed59f1292fc0952c2832 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 19 Oct 2022 22:59:33 +0200 Subject: [PATCH 094/224] Fix forum crawling crashing for empty threads --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 671d48a..70d2cd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Fixed +- Forum crawling crashing when parsing empty (= 0 messages) threads + ## 3.4.1 - 2022-08-17 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 7bab152..8795512 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -937,6 +937,13 @@ def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThre for p in forum_export.select("body > p"): title_tag = p content_tag = p.find_next_sibling("ul") + + if not content_tag: + # ILIAS allows users to delete the initial post while keeping the thread open + # This produces empty threads without *any* content. + # I am not sure why you would want this, but ILIAS makes it easy to do. + continue + title = p.find("b").text if ":" in title: title = title[title.find(":") + 1:] From d72fc2760b1dd8243ccf21876bb8cc6e027944bb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 13:09:29 +0200 Subject: [PATCH 095/224] Handle empty forums --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 7 +++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70d2cd5..c7a9899 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. 
### Fixed - Forum crawling crashing when parsing empty (= 0 messages) threads +- Forum crawling crashing when a forum has no threads at all ## 3.4.1 - 2022-08-17 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 8795512..9ea6b9f 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -59,6 +59,7 @@ class IliasPageElement: class IliasDownloadForumData: url: str form_data: Dict[str, Union[str, List[str]]] + empty: bool @dataclass @@ -130,14 +131,16 @@ class IliasPage: return None post_url = self._abs_url_from_relative(form["action"]) + thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] + form_data: Dict[str, Union[str, List[str]]] = { - "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})], + "thread_ids[]": thread_ids, "selected_cmd2": "html", "select_cmd2": "Ausführen", "selected_cmd": "", } - return IliasDownloadForumData(post_url, form_data) + return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0) def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_forum_page(): diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 1852c5f..f2d5215 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -658,7 +658,7 @@ instance's greatest bottleneck. @_iorepeat(3, "crawling forum") @anoncritical async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: - elements = [] + elements: List[IliasForumThread] = [] async with cl: next_stage_url = element.url @@ -677,6 +677,10 @@ instance's greatest bottleneck. 
download_data = page.get_download_forum_data() if not download_data: raise CrawlWarning("Failed to extract forum data") + if download_data.empty: + log.explain("Forum had no threads") + elements = [] + return html = await self._post_authenticated(download_data.url, download_data.form_data) elements = parse_ilias_forum_export(soupify(html)) From fb4631ba180a9ff0303d59e798d4bccfa0253666 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 13:13:36 +0200 Subject: [PATCH 096/224] Fix ilias background login --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 35 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index f2d5215..10a270f 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -23,6 +23,12 @@ from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, Ilia TargetType = Union[str, int] +_ILIAS_URL = "https://ilias.studium.kit.edu" + + +class KitShibbolethBackgroundLoginSuccessful(): + pass + class KitIliasWebCrawlerSection(HttpCrawlerSection): def target(self) -> TargetType: @@ -36,7 +42,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): if target == "desktop": # Full personal desktop return target - if target.startswith("https://ilias.studium.kit.edu"): + if target.startswith(_ILIAS_URL): # ILIAS URL return target @@ -181,7 +187,7 @@ instance's greatest bottleneck. 
section.tfa_auth(authenticators), ) - self._base_url = "https://ilias.studium.kit.edu" + self._base_url = _ILIAS_URL self._target = section.target() self._link_file_redirect_delay = section.link_redirect_delay() @@ -808,14 +814,17 @@ class KitShibbolethLogin: # Equivalent: Click on "Mit KIT-Account anmelden" button in # https://ilias.studium.kit.edu/login.php - url = "https://ilias.studium.kit.edu/shib_login.php" + url = f"{_ILIAS_URL}/shib_login.php" data = { "sendLogin": "1", "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", "il_target": "", "home_organization_selection": "Weiter", } - soup: BeautifulSoup = await _shib_post(sess, url, data) + soup: Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful] = await _shib_post(sess, url, data) + + if isinstance(soup, KitShibbolethBackgroundLoginSuccessful): + return # Attempt to login using credentials, if necessary while not self._login_successful(soup): @@ -854,7 +863,7 @@ class KitShibbolethLogin: # (or clicking "Continue" if you have JS disabled) relay_state = soup.find("input", {"name": "RelayState"}) saml_response = soup.find("input", {"name": "SAMLResponse"}) - url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" + url = f"{_ILIAS_URL}/Shibboleth.sso/SAML2/POST" data = { # using the info obtained in the while loop above "RelayState": relay_state["value"], "SAMLResponse": saml_response["value"], @@ -903,22 +912,35 @@ async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> Beautifu return soupify(await response.read()) -async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: +async def _shib_post( + session: aiohttp.ClientSession, + url: str, + data: Any +) -> Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful]: """ aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected by Shibboleth. Thanks a lot. 
So now we unroll the requests manually, parse location headers and build encoded URL objects ourselves... Who thought mangling location header was a good idea?? """ + log.explain_topic("Shib login POST") async with session.post(url, data=data, allow_redirects=False) as response: location = response.headers.get("location") + log.explain(f"Got location {location!r}") if not location: raise CrawlWarning(f"Login failed (1), no location header present at {url}") correct_url = yarl.URL(location, encoded=True) + log.explain(f"Corrected location to {correct_url!r}") + + if str(correct_url).startswith(_ILIAS_URL): + log.explain("ILIAS recognized our shib token and logged us in in the background, returning") + return KitShibbolethBackgroundLoginSuccessful() async with session.get(correct_url, allow_redirects=False) as response: location = response.headers.get("location") + log.explain(f"Redirected to {location!r} with status {response.status}") # If shib still still has a valid session, it will directly respond to the request if location is None: + log.explain("Shib recognized us, returning its response directly") return soupify(await response.read()) as_yarl = yarl.URL(response.url) @@ -932,6 +954,7 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea path=location, encoded=True ) + log.explain(f"Corrected location to {correct_url!r}") async with session.get(correct_url, allow_redirects=False) as response: return soupify(await response.read()) From 5fdd40204b156b15c008ec1dee05e168672fe243 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 14:33:58 +0200 Subject: [PATCH 097/224] Unwrap future meetings when ILIAS hides them behind a pagination --- PFERD/crawl/ilias/kit_ilias_html.py | 20 +++++++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 29 ++++++++++++++-------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 
9ea6b9f..2f0011e 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -146,11 +146,17 @@ class IliasPage: if self._is_forum_page(): if "trows=800" in self._page_url: return None + log.explain("Requesting *all* forum threads") return self._get_show_max_forum_entries_per_page_url() if self._is_ilias_opencast_embedding(): + log.explain("Unwrapping opencast embedding") return self.get_child_elements()[0] if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + log.explain("Unwrapping video pagination") return self._find_video_entries_paginated()[0] + if self._contains_collapsed_future_meetings(): + log.explain("Requesting *all* future meetings") + return self._uncollapse_future_meetings_url() return None def _is_forum_page(self) -> bool: @@ -203,6 +209,16 @@ class IliasPage: return False return "target=copa_" in link.get("value") + def _contains_collapsed_future_meetings(self) -> bool: + return self._uncollapse_future_meetings_url() is not None + + def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: + element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) + if not element: + return None + link = self._abs_url_from_link(element) + return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. 
The actual video stream url is nowhere @@ -793,6 +809,10 @@ class IliasPage: if img_tag is None: img_tag = found_parent.select_one("img.icon") + if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): + log.explain("Found session expansion button, skipping it as it has no content") + return None + if img_tag is None: _unexpected_html_warning() log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 10a270f..bc0d816 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -234,19 +234,28 @@ instance's greatest bottleneck. async def gather_elements() -> None: elements.clear() async with cl: - soup = await self._get_page(url) - - if expected_id is not None: - perma_link_element: Tag = soup.find(id="current_perma_link") - if not perma_link_element or "crs_" not in perma_link_element.get("value"): - raise CrawlError("Invalid course id? Didn't find anything looking like a course") + next_stage_url: Optional[str] = url + current_parent = None # Duplicated code, but the root page is special - we want to avoid fetching it twice! - log.explain_topic("Parsing root HTML page") - log.explain(f"URL: {url}") - page = IliasPage(soup, url, None) - elements.extend(page.get_child_elements()) + while next_stage_url: + soup = await self._get_page(next_stage_url) + if current_parent is None and expected_id is not None: + perma_link_element: Tag = soup.find(id="current_perma_link") + if not perma_link_element or "crs_" not in perma_link_element.get("value"): + raise CrawlError("Invalid course id? 
Didn't find anything looking like a course") + + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {next_stage_url}") + page = IliasPage(soup, next_stage_url, current_parent) + if next_element := page.get_next_stage_element(): + current_parent = next_element + next_stage_url = next_element.url + else: + next_stage_url = None + + elements.extend(page.get_child_elements()) if description_string := page.get_description(): description.append(description_string) From e1430e629844ad122a78d18197ed54100c734bbb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 18:36:34 +0200 Subject: [PATCH 098/224] Handle (and ignore) surveys --- PFERD/crawl/ilias/kit_ilias_html.py | 3 +++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 2f0011e..d969577 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -24,6 +24,7 @@ class IliasElementType(Enum): LINK = "link" BOOKING = "booking" MEETING = "meeting" + SURVEY = "survey" VIDEO = "video" VIDEO_PLAYER = "video_player" VIDEO_FOLDER = "video_folder" @@ -730,6 +731,8 @@ class IliasPage: return IliasElementType.TEST if "fold" in icon["class"]: return IliasElementType.FOLDER + if "svy" in icon["class"]: + return IliasElementType.SURVEY _unexpected_html_warning() log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index bc0d816..5ff8212 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -380,6 +380,13 @@ instance's greatest bottleneck. 
log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Tests contain no relevant files") log.explain("Answer: No") + elif element.type == IliasElementType.SURVEY: + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](surveys contain no relevant data)" + ) return None elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) From 1b6be6bd79112faea6e56c43f4756dde10ba00ba Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 18:36:54 +0200 Subject: [PATCH 099/224] Handle content pages in cards --- PFERD/crawl/ilias/kit_ilias_html.py | 2 ++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d969577..ee0364a 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -731,6 +731,8 @@ class IliasPage: return IliasElementType.TEST if "fold" in icon["class"]: return IliasElementType.FOLDER + if "copa" in icon["class"]: + return IliasElementType.FOLDER if "svy" in icon["class"]: return IliasElementType.SURVEY diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 5ff8212..9295e93 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -377,9 +377,13 @@ instance's greatest bottleneck. 
return None return await self._handle_forum(element, element_path) elif element.type == IliasElementType.TEST: - log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") - log.explain("Tests contain no relevant files") - log.explain("Answer: No") + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](tests contain no relevant data)" + ) + return None elif element.type == IliasElementType.SURVEY: log.status( "[bold bright_black]", From f47d2f11d843bfd3307815b231dd3e3df0265cef Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 Oct 2022 20:28:06 +0200 Subject: [PATCH 100/224] Append trailing slash to kit-ipd links to ensure urljoin works as expected --- CHANGELOG.md | 1 + PFERD/crawl/kit_ipd_crawler.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7a9899..24d9fa6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Fixed - Forum crawling crashing when parsing empty (= 0 messages) threads - Forum crawling crashing when a forum has no threads at all +- kit-ipd crawler if URL did not end with a trailing slash ## 3.4.1 - 2022-08-17 diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index d9fac32..338e059 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -24,6 +24,9 @@ class KitIpdCrawlerSection(HttpCrawlerSection): if not target.startswith("https://"): self.invalid_value("target", target, "Should be a URL") + if not target.endswith("/"): + target = target + "/" + return target def link_regex(self) -> Pattern[str]: From 37b51a66d87d368afc3bef2b81edf1629f95cd57 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 Oct 2022 18:22:37 +0200 Subject: [PATCH 101/224] Update changelog --- CHANGELOG.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24d9fa6..2bb0231 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,10 +22,16 @@ 
ambiguous situations. ## Unreleased +### Added +- Recognize and crawl content pages in cards +- Recognize and ignore surveys + ### Fixed -- Forum crawling crashing when parsing empty (= 0 messages) threads +- Forum crawling crashing when a thread has no messages at all - Forum crawling crashing when a forum has no threads at all -- kit-ipd crawler if URL did not end with a trailing slash +- Ilias login failing in some cases +- Crawling of paginated future meetings +- IPD crawler handling of URLs without trailing slash ## 3.4.1 - 2022-08-17 From 259cfc20cccae68a2f34984796405a35a7f31707 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 Oct 2022 18:26:17 +0200 Subject: [PATCH 102/224] Bump version to 3.4.2 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bb0231..9ecddf7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.4.2 - 2022-10-26 + ### Added - Recognize and crawl content pages in cards - Recognize and ignore surveys diff --git a/PFERD/version.py b/PFERD/version.py index 8832a51..0ef5d89 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.1" +VERSION = "3.4.2" From c020cccc64f152882688b119416f0582ec94e074 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Oct 2022 14:08:29 +0200 Subject: [PATCH 103/224] Include found paths in "second path found" warning --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 2 +- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 8 +++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ecddf7..3dd25b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Changed +- Clear up error message shown when multiple paths are found to an element + ## 3.4.2 - 2022-10-26 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index ee0364a..56dcf7b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -134,7 +134,7 @@ class IliasPage: thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] - form_data: Dict[str, Union[str, List[ſtr]]] = { + form_data: Dict[str, Union[str, List[str]]] = { "thread_ids[]": thread_ids, "selected_cmd2": "html", "select_cmd2": "Ausführen", diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 9295e93..e3719b8 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -194,7 +194,7 @@ instance's greatest bottleneck. self._links = section.links() self._videos = section.videos() self._forums = section.forums() - self._visited_urls: Set[str] = set() + self._visited_urls: Dict[str, PurePath] = dict() async def _run(self) -> None: if isinstance(self._target, int): @@ -348,9 +348,11 @@ instance's greatest bottleneck. ) -> Optional[Coroutine[Any, Any, None]]: if element.url in self._visited_urls: raise CrawlWarning( - f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" + f"Found second path to element {element.name!r} at {element.url!r}. " + + f"First path: {fmt_path(self._visited_urls[element.url])}. " + + f"Second path: {fmt_path(parent_path)}." 
) - self._visited_urls.add(element.url) + self._visited_urls[element.url] = parent_path element_path = PurePath(parent_path, element.name) From 07200bbde5fb72f2f846101b92b440724c8c7959 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 Oct 2022 14:10:45 +0100 Subject: [PATCH 104/224] Document ilias web crawler's forums option --- CHANGELOG.md | 3 +++ CONFIG.md | 1 + 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dd25b8..e5e81d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- Missing documentation for `forums` option + ### Changed - Clear up error message shown when multiple paths are found to an element diff --git a/CONFIG.md b/CONFIG.md index 0f114ed..1ca43c4 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -181,6 +181,7 @@ script once per day should be fine. redirect to the actual URL. Set to a negative value to disable the automatic redirect. (Default: `-1`) - `videos`: Whether to download videos. (Default: `no`) +- `forums`: Whether to download forum threads. (Default: `no`) - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: `20.0`) From e69b55b3496d58bc19d76429ca0078ab10f23074 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Fri, 4 Nov 2022 12:18:26 +0100 Subject: [PATCH 105/224] Add more unofficial package managers (#66) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ce917b0..31a3475 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. 
Unofficial packages are available for: - [AUR](https://aur.archlinux.org/packages/pferd) +- [brew](https://formulae.brew.sh/formula/pferd) +- [conda-forge](https://github.com/conda-forge/pferd-feedstock) - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) +- [PyPi](https://pypi.org/project/pferd) See also PFERD's [repology page](https://repology.org/project/pferd/versions). From 635caa765decd9a747d8b313252fd6b56cea0951 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 15 Nov 2022 17:17:55 +0100 Subject: [PATCH 106/224] Fix typo Thanks, burg113 --- CONFIG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONFIG.md b/CONFIG.md index 1ca43c4..640e4af 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -290,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`. Example: `foo/bar --> baz` - Doesn't match `foo`, `a/foo/bar` or `foo/baz` - Converts `foo/bar` into `baz` -- Converts `foo/bar/wargl` into `bar/wargl` +- Converts `foo/bar/wargl` into `baz/wargl` Example: `foo/bar --> !` - Doesn't match `foo`, `a/foo/bar` or `foo/baz` From c0d6d8b22975234b0c9141a22307c8036698566c Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 21 Nov 2022 17:53:30 +0100 Subject: [PATCH 107/224] Use url after redirect for relative links --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 27 ++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5e81d6..5bbefd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,9 @@ ambiguous situations. 
### Changed - Clear up error message shown when multiple paths are found to an element +### Fixed +- IPD crawler unnecessarily appending trailing slashes + ## 3.4.2 - 2022-10-26 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 338e059..c852be0 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -2,7 +2,7 @@ import os import re from dataclasses import dataclass from pathlib import PurePath -from typing import Awaitable, List, Optional, Pattern, Set, Union +from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -24,9 +24,6 @@ class KitIpdCrawlerSection(HttpCrawlerSection): if not target.startswith("https://"): self.invalid_value("target", target, "Should be a URL") - if not target.endswith("/"): - target = target + "/" - return target def link_regex(self) -> Pattern[str]: @@ -102,32 +99,32 @@ class KitIpdCrawler(HttpCrawler): await self._stream_from_url(file.url, sink, bar) async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: - page = await self.get_page() + page, url = await self.get_page() elements: List[Tag] = self._find_file_links(page) items: Set[Union[KitIpdFile, KitIpdFolder]] = set() for element in elements: folder_label = self._find_folder_label(element) if folder_label: - folder = self._extract_folder(folder_label) + folder = self._extract_folder(folder_label, url) if folder not in items: items.add(folder) folder.explain() else: - file = self._extract_file(element) + file = self._extract_file(element, url) items.add(file) log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") log.explain("Attributing it to root folder") return items - def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: + def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder: files: List[KitIpdFile] = [] name = folder_tag.getText().strip() container: Tag = 
folder_tag.findNextSibling(name="table") for link in self._find_file_links(container): - files.append(self._extract_file(link)) + files.append(self._extract_file(link, url)) return KitIpdFolder(name, files) @@ -138,16 +135,16 @@ class KitIpdCrawler(HttpCrawler): return None return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) - def _extract_file(self, link: Tag) -> KitIpdFile: - url = self._abs_url_from_link(link) + def _extract_file(self, link: Tag, url: str) -> KitIpdFile: + url = self._abs_url_from_link(url, link) name = os.path.basename(url) return KitIpdFile(name, url) def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: return tag.findAll(name="a", attrs={"href": self._file_regex}) - def _abs_url_from_link(self, link_tag: Tag) -> str: - return urljoin(self._url, link_tag.get("href")) + def _abs_url_from_link(self, url: str, link_tag: Tag) -> str: + return urljoin(url, link_tag.get("href")) async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: async with self.session.get(url, allow_redirects=False) as resp: @@ -162,7 +159,7 @@ class KitIpdCrawler(HttpCrawler): sink.done() - async def get_page(self) -> BeautifulSoup: + async def get_page(self) -> Tuple[BeautifulSoup, str]: async with self.session.get(self._url) as request: # The web page for Algorithmen für Routenplanung contains some # weird comments that beautifulsoup doesn't parse correctly. This @@ -170,4 +167,4 @@ class KitIpdCrawler(HttpCrawler): # cause issues on other pages. 
content = (await request.read()).decode("utf-8") content = re.sub(r"", "", content) - return soupify(content.encode("utf-8")) + return soupify(content.encode("utf-8")), str(request.url) From 55a2de6b88bbd2ee0cb031271e7045f53caa1702 Mon Sep 17 00:00:00 2001 From: c0derMo Date: Fri, 25 Nov 2022 10:25:22 +0000 Subject: [PATCH 108/224] Fix crawling English opencast --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bbefd4..1dc5abc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ ambiguous situations. ### Fixed - IPD crawler unnecessarily appending trailing slashes +- Crawling opencast when ILIAS is set to English ## 3.4.2 - 2022-10-26 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 56dcf7b..c0ebdc9 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -366,7 +366,7 @@ class IliasPage: """ # Video start links are marked with an "Abspielen" link video_links: List[Tag] = self._soup.findAll( - name="a", text=re.compile(r"\s*Abspielen\s*") + name="a", text=re.compile(r"\s*(Abspielen|Play)\s*") ) results: List[IliasPageElement] = [] From 6d44aac2783c69031e7686263fc0a2285912376f Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 29 Nov 2022 18:22:19 +0100 Subject: [PATCH 109/224] Bump version to 3.4.3 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dc5abc..8793d43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.4.3 - 2022-11-29 + ### Added - Missing documentation for `forums` option diff --git a/PFERD/version.py b/PFERD/version.py index 0ef5d89..7043d78 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.2" +VERSION = "3.4.3" From 722d2eb393913e770aff17da6b5b3b6603d1ee67 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 25 Nov 2022 12:49:36 +0100 Subject: [PATCH 110/224] Fix crawling of courses with preselected timeline tab --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8793d43..b1d18cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Fixed +- Crawling of courses with the timeline view as the default tab + ## 3.4.3 - 2022-11-29 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index c0ebdc9..44e44d9 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -158,6 +158,8 @@ class IliasPage: if self._contains_collapsed_future_meetings(): log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() + if not self._is_content_tab_selected(): + return self._select_content_page_url() return None def _is_forum_page(self) -> bool: @@ -220,6 +222,27 @@ class IliasPage: link = self._abs_url_from_link(element) return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + def _is_content_tab_selected(self) -> bool: + return self._select_content_page_url() is None + + def _select_content_page_url(self) -> Optional[IliasPageElement]: + tab = self._soup.find( + id="tab_view_content", + attrs={"class": lambda x: x is not None and "active" not in x} + ) + # Already selected (or not found) + if not tab: + return None + link = tab.find("a") + if link: + link = self._abs_url_from_link(link) + return 
IliasPageElement(IliasElementType.FOLDER, link, "select content page") + + _unexpected_html_warning() + log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.") + log.warn_contd("PFERD might not find content on the course's main page.") + return None + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. The actual video stream url is nowhere From 467fc526e8411d4a5113dbb78747aa119981c476 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 21 Mar 2023 23:52:24 +0100 Subject: [PATCH 111/224] Fix crawling of file/video cards --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1d18cd..c27059b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Crawling of courses with the timeline view as the default tab +- Crawling of file and custom opencast cards ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 44e44d9..079cfd6 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -738,7 +738,7 @@ class IliasPage: icon: Tag = card_root.select_one(".il-card-repository-head .icon") - if "opencast" in icon["class"]: + if "opencast" in icon["class"] or "xoct" in icon["class"]: return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED if "exc" in icon["class"]: return IliasElementType.EXERCISE @@ -758,6 +758,8 @@ class IliasPage: return IliasElementType.FOLDER if "svy" in icon["class"]: return IliasElementType.SURVEY + if "file" in icon["class"]: + return IliasElementType.FILE _unexpected_html_warning() log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") From 6f30c6583d6512c92042c581e86027a4341ddc89 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 21 
Mar 2023 23:52:33 +0100 Subject: [PATCH 112/224] Fix crawling of cards without descriptions --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c27059b..7a5f654 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Fixed - Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards +- Crawling of button cards without descriptions ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 079cfd6..efe6757 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -708,7 +708,11 @@ class IliasPage: "div", attrs={"class": lambda x: x and "caption" in x}, ) - description = caption_parent.find_next_sibling("div").getText().strip() + caption_container = caption_parent.find_next_sibling("div") + if caption_container: + description = caption_container.getText().strip() + else: + description = None if not type: _unexpected_html_warning() From 0294ceb7d5ff074dcc2566872d6b5f64f99c598f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 22 Mar 2023 00:08:19 +0100 Subject: [PATCH 113/224] Update github action versions --- .github/workflows/build-and-release.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 090ac7e..83a36e4 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -17,9 +17,9 @@ jobs: python: ["3.9"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} @@ -45,7 +45,7 @@ jobs: run: mv dist/pferd* dist/pferd-${{ matrix.os }} - name: Upload binary - uses: actions/upload-artifact@v2 + uses: 
actions/upload-artifact@v3 with: name: Binaries path: dist/pferd-${{ matrix.os }} @@ -57,7 +57,7 @@ jobs: steps: - name: Download binaries - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: Binaries From 443f7fe83913bcb82a42d7b70d4d05df65f05278 Mon Sep 17 00:00:00 2001 From: "Mr. Pine" Date: Sat, 29 Jul 2023 17:54:42 +0200 Subject: [PATCH 114/224] Add `no-delete-prompt-overwrite` crawler conflict resolution option (#75) --- CHANGELOG.md | 3 +++ CONFIG.md | 2 ++ LICENSE | 3 ++- PFERD/output_dir.py | 11 ++++++----- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a5f654..22522e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ ambiguous situations. - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +### Added +- `no-delete-prompt-override` conflict resolution strategy + ## 3.4.3 - 2022-11-29 ### Added diff --git a/CONFIG.md b/CONFIG.md index 640e4af..84ee885 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -75,6 +75,8 @@ common to all crawlers: using `prompt` and always choosing "yes". - `no-delete`: Never delete local files, but overwrite local files if the remote file is different. + - `no-delete-prompt-overwrite`: Never delete local files, but prompt to overwrite local files if the + remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). 
(Default: empty) - `tasks`: The maximum number of concurrent tasks (such as crawling or diff --git a/LICENSE b/LICENSE index fe2293f..d81e827 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,6 @@ Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, - TheChristophe, Scriptim, thelukasprobst, Toorero + TheChristophe, Scriptim, thelukasprobst, Toorero, + Mr-Pine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index c92f4a6..38d1288 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -44,6 +44,7 @@ class OnConflict(Enum): LOCAL_FIRST = "local-first" REMOTE_FIRST = "remote-first" NO_DELETE = "no-delete" + NO_DELETE_PROMPT_OVERWRITE = "no-delete-prompt-overwrite" @staticmethod def from_string(string: str) -> "OnConflict": @@ -51,7 +52,7 @@ class OnConflict(Enum): return OnConflict(string) except ValueError: raise ValueError("must be one of 'prompt', 'local-first'," - " 'remote-first', 'no-delete'") + " 'remote-first', 'no-delete', 'no-delete-prompt-overwrite'") @dataclass @@ -264,7 +265,7 @@ class OutputDirectory: on_conflict: OnConflict, path: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Replace {fmt_path(path)} with remote file?" return await prompt_yes_no(prompt, default=False) @@ -283,7 +284,7 @@ class OutputDirectory: on_conflict: OnConflict, path: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" 
return await prompt_yes_no(prompt, default=False) @@ -303,7 +304,7 @@ class OutputDirectory: path: PurePath, parent: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" return await prompt_yes_no(prompt, default=False) @@ -330,7 +331,7 @@ class OutputDirectory: return False elif on_conflict == OnConflict.REMOTE_FIRST: return True - elif on_conflict == OnConflict.NO_DELETE: + elif on_conflict in {OnConflict.NO_DELETE, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: return False # This should never be reached From d204dac8ced63534ca2b4596e9a63c880b2077a3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 2 Jun 2023 18:19:39 +0200 Subject: [PATCH 115/224] Detect unexpected root page redirects and abort operation --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 10 ++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 20 ++++++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22522e2..ee55659 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ ambiguous situations. 
- Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +- Abort crawling when encountering an unexpected ilias root page redirect ### Added - `no-delete-prompt-override` conflict resolution strategy diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index efe6757..aed2069 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -79,6 +79,16 @@ class IliasPage: self._page_type = source_element.type if source_element else None self._source_name = source_element.name if source_element else "" + @staticmethod + def is_root_page(soup: BeautifulSoup) -> bool: + permalink = soup.find(id="current_perma_link") + if permalink is None: + return False + value = permalink.attrs.get("value") + if value is None: + return False + return "goto.php?target=root_" in value + def get_child_elements(self) -> List[IliasPageElement]: """ Return all child page elements you can find here. diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index e3719b8..ae49edc 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -239,7 +239,7 @@ instance's greatest bottleneck. # Duplicated code, but the root page is special - we want to avoid fetching it twice! while next_stage_url: - soup = await self._get_page(next_stage_url) + soup = await self._get_page(next_stage_url, root_page_allowed=True) if current_parent is None and expected_id is not None: perma_link_element: Tag = soup.find(id="current_perma_link") @@ -739,12 +739,12 @@ instance's greatest bottleneck. 
sink.file.write(content.encode("utf-8")) sink.done() - async def _get_page(self, url: str) -> BeautifulSoup: + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) # We weren't authenticated, so try to do that await self.authenticate(auth_id) @@ -753,9 +753,21 @@ instance's greatest bottleneck. async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) raise CrawlError("get_page failed even after authenticating") + def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: + if IliasPage.is_root_page(soup) and not root_page_allowed: + raise CrawlError( + "Unexpectedly encountered ILIAS root page. " + "This usually happens because the ILIAS instance is broken. " + "If so, wait a day or two and try again. " + "It could also happen because a crawled element links to the ILIAS root page. " + "If so, use a transform with a ! as target to ignore the particular element. 
" + f"The redirect came from {url}" + ) + return soup + async def _post_authenticated( self, url: str, From 123a57beec37090310f76df3746e6ce107ceb299 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Jul 2023 18:14:57 +0200 Subject: [PATCH 116/224] Fix mypy unreachable error in file_templates --- PFERD/crawl/ilias/file_templates.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 151a41b..59123a2 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -102,24 +102,24 @@ class Links(Enum): INTERNET_SHORTCUT = "internet-shortcut" def template(self) -> Optional[str]: - if self == self.FANCY: + if self == Links.FANCY: return _link_template_fancy - elif self == self.PLAINTEXT: + elif self == Links.PLAINTEXT: return _link_template_plain - elif self == self.INTERNET_SHORTCUT: + elif self == Links.INTERNET_SHORTCUT: return _link_template_internet_shortcut - elif self == self.IGNORE: + elif self == Links.IGNORE: return None raise ValueError("Missing switch case") def extension(self) -> Optional[str]: - if self == self.FANCY: + if self == Links.FANCY: return ".html" - elif self == self.PLAINTEXT: + elif self == Links.PLAINTEXT: return ".txt" - elif self == self.INTERNET_SHORTCUT: + elif self == Links.INTERNET_SHORTCUT: return ".url" - elif self == self.IGNORE: + elif self == Links.IGNORE: return None raise ValueError("Missing switch case") From 68c398f1fea5cfefd86d11e79f2f6582d50e6563 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Jul 2023 23:23:10 +0200 Subject: [PATCH 117/224] Add support for ILIAS learning modules --- CHANGELOG.md | 1 + PFERD/crawl/ilias/file_templates.py | 69 +++++++++ PFERD/crawl/ilias/ilias_html_cleaner.py | 2 +- PFERD/crawl/ilias/kit_ilias_html.py | 46 ++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 160 ++++++++++++++++++++- 5 files changed, 272 insertions(+), 6 deletions(-) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index ee55659..6e3925c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ ambiguous situations. ### Added - `no-delete-prompt-override` conflict resolution strategy +- support for ILIAS learning modules ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 59123a2..b206461 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,6 +1,10 @@ from enum import Enum from typing import Optional +import bs4 + +from PFERD.utils import soupify + _link_template_plain = "{{link}}" _link_template_fancy = """ @@ -94,6 +98,71 @@ _link_template_internet_shortcut = """ URL={{link}} """.strip() +_learning_module_template = """ + + + + + {{name}} + + + + +{{body}} + + +""" + + +def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str: + # Seems to be comments, ignore those. + for elem in body.select(".il-copg-mob-fullscreen-modal"): + elem.decompose() + + nav_template = """ + + """ + if prev and body.select_one(".ilc_page_lnav_LeftNavigation"): + text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip() + left = f'{text}' + else: + left = "" + + if next and body.select_one(".ilc_page_rnav_RightNavigation"): + text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip() + right = f'{text}' + else: + right = "" + + if top_nav := body.select_one(".ilc_page_tnav_TopNavigation"): + top_nav.replace_with( + soupify(nav_template.replace("{{left}}", left).replace("{{right}}", right).encode()) + ) + + if bot_nav := body.select_one(".ilc_page_bnav_BottomNavigation"): + bot_nav.replace_with(soupify(nav_template.replace( + "{{left}}", left).replace("{{right}}", right).encode()) + ) + + body = body.prettify() + return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name) + class Links(Enum): IGNORE = "ignore" diff --git 
a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index 5952309..5495304 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -82,7 +82,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: dummy.decompose() if len(children) > 1: continue - if type(children[0]) == Comment: + if isinstance(type(children[0]), Comment): dummy.decompose() for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index aed2069..46a8073 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -22,6 +22,7 @@ class IliasElementType(Enum): FOLDER = "folder" FORUM = "forum" LINK = "link" + LEARNING_MODULE = "learning_module" BOOKING = "booking" MEETING = "meeting" SURVEY = "survey" @@ -71,6 +72,14 @@ class IliasForumThread: mtime: Optional[datetime] +@dataclass +class IliasLearningModulePage: + title: str + content: Tag + next_url: Optional[str] + previous_url: Optional[str] + + class IliasPage: def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): @@ -136,6 +145,34 @@ class IliasPage: return BeautifulSoup(raw_html, "html.parser") + def get_learning_module_data(self) -> Optional[IliasLearningModulePage]: + if not self._is_learning_module_page(): + return None + content = self._soup.select_one("#ilLMPageContent") + title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip() + return IliasLearningModulePage( + title=title, + content=content, + next_url=self._find_learning_module_next(), + previous_url=self._find_learning_module_prev() + ) + + def _find_learning_module_next(self) -> Optional[str]: + for link in self._soup.select("a.ilc_page_rnavlink_RightNavigationLink"): + url = self._abs_url_from_link(link) + if "baseClass=ilLMPresentationGUI" not in url: + continue + return url + return None + + def 
_find_learning_module_prev(self) -> Optional[str]: + for link in self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink"): + url = self._abs_url_from_link(link) + if "baseClass=ilLMPresentationGUI" not in url: + continue + return url + return None + def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) if not form: @@ -222,6 +259,12 @@ class IliasPage: return False return "target=copa_" in link.get("value") + def _is_learning_module_page(self) -> bool: + link = self._soup.find(id="current_perma_link") + if not link: + return False + return "target=pg_" in link.get("value") + def _contains_collapsed_future_meetings(self) -> bool: return self._uncollapse_future_meetings_url() is not None @@ -812,6 +855,9 @@ class IliasPage: if "cmdClass=ilobjtestgui" in parsed_url.query: return IliasElementType.TEST + if "baseClass=ilLMPresentationGUI" in parsed_url.query: + return IliasElementType.LEARNING_MODULE + # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so # try to guess it from the image. 
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ae49edc..f82d684 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,8 +1,11 @@ import asyncio +import base64 +import os import re from collections.abc import Awaitable, Coroutine from pathlib import PurePath -from typing import Any, Callable, Dict, List, Optional, Set, Union, cast +from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast +from urllib.parse import urljoin import aiohttp import yarl @@ -16,10 +19,10 @@ from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import Links +from .file_templates import Links, learning_module_template from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, - _sanitize_path_name, parse_ilias_forum_export) +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, + IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -394,6 +397,8 @@ instance's greatest bottleneck. "[bright_black](surveys contain no relevant data)" ) return None + elif element.type == IliasElementType.LEARNING_MODULE: + return await self._handle_learning_module(element, element_path) elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) elif element.type == IliasElementType.BOOKING: @@ -739,6 +744,135 @@ instance's greatest bottleneck. 
sink.file.write(content.encode("utf-8")) sink.done() + async def _handle_learning_module( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Coroutine[Any, Any, None]]: + maybe_cl = await self.crawl(element_path) + if not maybe_cl: + return None + return self._crawl_learning_module(element, maybe_cl) + + @_iorepeat(3, "crawling learning module") + @anoncritical + async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None: + elements: List[IliasLearningModulePage] = [] + + async with cl: + log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {element.url}") + soup = await self._get_page(element.url) + page = IliasPage(soup, element.url, None) + if next := page.get_learning_module_data(): + elements.extend(await self._crawl_learning_module_direction( + cl.path, next.previous_url, "left" + )) + elements.append(next) + elements.extend(await self._crawl_learning_module_direction( + cl.path, next.next_url, "right" + )) + + # Reflect their natural ordering in the file names + for index, lm_element in enumerate(elements): + lm_element.title = f"{index:02}_{lm_element.title}" + + tasks: List[Awaitable[None]] = [] + for index, elem in enumerate(elements): + prev_url = elements[index - 1].title if index > 0 else None + next_url = elements[index + 1].title if index < len(elements) - 1 else None + tasks.append(asyncio.create_task( + self._download_learning_module_page(cl.path, elem, prev_url, next_url) + )) + + # And execute them + await self.gather(tasks) + + async def _crawl_learning_module_direction( + self, + path: PurePath, + start_url: Optional[str], + dir: Union[Literal["left"], Literal["right"]] + ) -> List[IliasLearningModulePage]: + elements: List[IliasLearningModulePage] = [] + + if not start_url: + return elements + + next_element_url: Optional[str] = start_url + counter = 0 + while next_element_url: + log.explain_topic(f"Parsing HTML page for {fmt_path(path)} 
({dir}-{counter})") + log.explain(f"URL: {next_element_url}") + soup = await self._get_page(next_element_url) + page = IliasPage(soup, next_element_url, None) + if next := page.get_learning_module_data(): + elements.append(next) + if dir == "left": + next_element_url = next.previous_url + else: + next_element_url = next.next_url + counter += 1 + + return elements + + @anoncritical + @_iorepeat(3, "saving learning module page") + async def _download_learning_module_page( + self, + parent_path: PurePath, + element: IliasLearningModulePage, + prev: Optional[str], + next: Optional[str] + ) -> None: + path = parent_path / (_sanitize_path_name(element.title) + ".html") + maybe_dl = await self.download(path) + if not maybe_dl: + return + my_path = self._transformer.transform(maybe_dl.path) + if not my_path: + return + + if prev: + prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) + if prev_p: + prev = os.path.relpath(prev_p, my_path.parent) + else: + prev = None + if next: + next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) + if next_p: + next = os.path.relpath(next_p, my_path.parent) + else: + next = None + + async with maybe_dl as (bar, sink): + content = element.content + content = await self.internalize_images(content) + sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8")) + sink.done() + + async def internalize_images(self, tag: Tag) -> Tag: + """ + Tries to fetch ILIAS images and embed them as base64 data. 
+ """ + log.explain_topic("Internalizing images") + for elem in tag.find_all(recursive=True): + if not isinstance(elem, Tag): + continue + if elem.name == "img": + if src := elem.attrs.get("src", None): + url = urljoin(_ILIAS_URL, src) + if not url.startswith(_ILIAS_URL): + continue + log.explain(f"Internalizing {url!r}") + img = await self._get_authenticated(url) + elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode() + if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): + # For unknown reasons the protocol seems to be stripped. + elem.attrs["src"] = "https:" + elem.attrs["src"] + return tag + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: @@ -772,7 +906,7 @@ instance's greatest bottleneck. self, url: str, data: dict[str, Union[str, List[str]]] - ) -> BeautifulSoup: + ) -> bytes: auth_id = await self._current_auth_id() form_data = aiohttp.FormData() @@ -792,6 +926,22 @@ instance's greatest bottleneck. return await request.read() raise CrawlError("post_authenticated failed even after authenticating") + async def _get_authenticated(self, url: str) -> bytes: + auth_id = await self._current_auth_id() + + async with self.session.get(url, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) + + # Retry once after authenticating. If this fails, we will die. + async with self.session.get(url, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + raise CrawlError("get_authenticated failed even after authenticating") + # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. 
@ _iorepeat(3, "Login", failure_is_error=True) From dbc2553b119c39c7a8ad196c6858fc8109f746a9 Mon Sep 17 00:00:00 2001 From: "Mr. Pine" <50425705+Mr-Pine@users.noreply.github.com> Date: Wed, 15 Mar 2023 15:33:42 +0100 Subject: [PATCH 118/224] Add default `show-not-deleted` option If set to `no`, PFERD won't print status or report messages for not deleted files --- CHANGELOG.md | 3 +++ CONFIG.md | 8 ++++++-- PFERD/__main__.py | 4 ++++ PFERD/cli/parser.py | 7 +++++++ PFERD/config.py | 3 +++ PFERD/logging.py | 20 ++++++++++++++++++++ PFERD/output_dir.py | 2 +- PFERD/pferd.py | 2 +- 8 files changed, 45 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e3925c..85513d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,9 @@ ambiguous situations. ### Added - `no-delete-prompt-override` conflict resolution strategy - support for ILIAS learning modules +- `show_not_deleted` option to stop printing the "Not Deleted" status or report + message. This combines nicely with the `no-delete-prompt-override` strategy, + causing PFERD to mostly ignore local-only files. ## 3.4.3 - 2022-11-29 diff --git a/CONFIG.md b/CONFIG.md index 84ee885..5f62749 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -26,6 +26,9 @@ default values for the other sections. `Added ...`) while running a crawler. (Default: `yes`) - `report`: Whether PFERD should print a report of added, changed and deleted local files for all crawlers before exiting. (Default: `yes`) +- `show_not_deleted`: Whether PFERD should print messages in status and report + when a local-only file wasn't deleted. Combines nicely with the + `no-delete-prompt-override` conflict resolution strategy. - `share_cookies`: Whether crawlers should share cookies where applicable. For example, some crawlers share cookies if they crawl the same website using the same account. (Default: `yes`) @@ -75,8 +78,9 @@ common to all crawlers: using `prompt` and always choosing "yes". 
- `no-delete`: Never delete local files, but overwrite local files if the remote file is different. - - `no-delete-prompt-overwrite`: Never delete local files, but prompt to overwrite local files if the - remote file is different. + - `no-delete-prompt-overwrite`: Never delete local files, but prompt to + overwrite local files if the remote file is different. Combines nicely + with the `show_not_deleted` option. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) - `tasks`: The maximum number of concurrent tasks (such as crawling or diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 4faeb13..cb8c67c 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -47,6 +47,8 @@ def configure_logging_from_args(args: argparse.Namespace) -> None: log.output_explain = args.explain if args.status is not None: log.output_status = args.status + if args.show_not_deleted is not None: + log.output_not_deleted = args.show_not_deleted if args.report is not None: log.output_report = args.report @@ -72,6 +74,8 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N log.output_status = config.default_section.status() if args.report is None: log.output_report = config.default_section.report() + if args.show_not_deleted is None: + log.output_not_deleted = config.default_section.show_not_deleted() except ConfigOptionError as e: log.error(str(e)) sys.exit(1) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index e753023..be483fd 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -215,6 +215,11 @@ PARSER.add_argument( action=BooleanOptionalAction, help="whether crawlers should share cookies where applicable" ) +PARSER.add_argument( + "--show-not-deleted", + action=BooleanOptionalAction, + help="print messages in status and report when PFERD did not delete a local only file" +) def load_default_section( @@ -233,6 +238,8 @@ def 
load_default_section( section["report"] = "yes" if args.report else "no" if args.share_cookies is not None: section["share_cookies"] = "yes" if args.share_cookies else "no" + if args.show_not_deleted is not None: + section["show_not_deleted"] = "yes" if args.show_not_deleted else "no" SUBPARSERS = PARSER.add_subparsers(title="crawlers") diff --git a/PFERD/config.py b/PFERD/config.py index 8f7e682..b2cff4e 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -82,6 +82,9 @@ class DefaultSection(Section): def report(self) -> bool: return self.s.getboolean("report", fallback=True) + def show_not_deleted(self) -> bool: + return self.s.getboolean("show_not_deleted", fallback=True) + def share_cookies(self) -> bool: return self.s.getboolean("share_cookies", fallback=True) diff --git a/PFERD/logging.py b/PFERD/logging.py index 340b21f..b958fb2 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -59,6 +59,7 @@ class Log: # Whether different parts of the output are enabled or disabled self.output_explain = False self.output_status = True + self.output_not_deleted = True self.output_report = True def _update_live(self) -> None: @@ -207,6 +208,17 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new action = escape(f"{action:<{self.STATUS_WIDTH}}") self.print(f"{style}{action}[/] {escape(text)} {suffix}") + def not_deleted(self, style: str, action: str, text: str, suffix: str = "") -> None: + """ + Print a message for a local only file that wasn't + deleted while crawling. Allows markup in the "style" + argument which will be applied to the "action" string. + """ + + if self.output_status and self.output_not_deleted: + action = escape(f"{action:<{self.STATUS_WIDTH}}") + self.print(f"{style}{action}[/] {escape(text)} {suffix}") + def report(self, text: str) -> None: """ Print a report after crawling. Allows markup. 
@@ -215,6 +227,14 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_report: self.print(text) + def report_not_deleted(self, text: str) -> None: + """ + Print a report for a local only file that wasn't deleted after crawling. Allows markup. + """ + + if self.output_report and self.output_not_deleted: + self.print(text) + @contextmanager def _bar( self, diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 38d1288..e9e9b93 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -496,7 +496,7 @@ class OutputDirectory: except OSError: pass else: - log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure)) + log.not_deleted("[bold bright_magenta]", "Not deleted", fmt_path(pure)) self._report.not_delete_file(pure) def load_prev_report(self) -> None: diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 079053b..b30a04a 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -180,7 +180,7 @@ class Pferd: log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}") for path in sorted(crawler.report.not_deleted_files): something_changed = True - log.report(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") + log.report_not_deleted(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") for warning in crawler.report.encountered_warnings: something_changed = True From b3d412360baeed6992535e6957d0bc1e368c337f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 26 Aug 2023 23:48:14 +0200 Subject: [PATCH 119/224] Add Nix flake --- flake.lock | 27 +++++++++++++++++++++++++++ flake.nix | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..914c58b --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1692986144, + "narHash": "sha256-M4VFpy7Av9j+33HF5nIGm0k2+DXXW4qSSKdidIKg5jY=", + "owner": 
"NixOS", + "repo": "nixpkgs", + "rev": "74e5bdc5478ebbe7ba5849f0d765f92757bb9dbf", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-23.05", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..e3d52af --- /dev/null +++ b/flake.nix @@ -0,0 +1,41 @@ +{ + description = "Tool for downloading course-related files from ILIAS"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05"; + }; + + outputs = { self, nixpkgs }: + let + # Helper function to generate an attrset '{ x86_64-linux = f "x86_64-linux"; ... }'. + forAllSystems = nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed; + in + { + packages = forAllSystems (system: + let pkgs = import nixpkgs { inherit system; }; + in + rec { + default = pkgs.python3Packages.buildPythonApplication rec { + pname = "pferd"; + # Performing black magic + # Don't worry, I sacrificed enough goats for the next few years + version = (pkgs.lib.importTOML ./PFERD/version.py).VERSION; + format = "pyproject"; + + src = ./.; + + nativeBuildInputs = with pkgs.python3Packages; [ + setuptools + ]; + + propagatedBuildInputs = with pkgs.python3Packages; [ + aiohttp + beautifulsoup4 + rich + keyring + certifi + ]; + }; + }); + }; +} From 2184ac804018e836e439e365ae2b0d184adae26d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 26 Aug 2023 19:39:40 +0200 Subject: [PATCH 120/224] Add support for ILIAS mediacast listings --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 110 +++++++++++++++------ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 45 +++++---- 3 files changed, 107 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85513d2..d58ea18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ ambiguous situations. 
- `show_not_deleted` option to stop printing the "Not Deleted" status or report message. This combines nicely with the `no-delete-prompt-override` strategy, causing PFERD to mostly ignore local-only files. +- support for mediacast video listings ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 46a8073..d5ea76d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, cast from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -26,10 +26,12 @@ class IliasElementType(Enum): BOOKING = "booking" MEETING = "meeting" SURVEY = "survey" - VIDEO = "video" - VIDEO_PLAYER = "video_player" - VIDEO_FOLDER = "video_folder" - VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" + MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" + MEDIACAST_VIDEO = "mediacast_video" + OPENCAST_VIDEO = "opencast_video" + OPENCAST_VIDEO_PLAYER = "opencast_video_player" + OPENCAST_VIDEO_FOLDER = "opencast_video_folder" + OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated" @dataclass @@ -45,7 +47,8 @@ class IliasPageElement: r"eid=(?P[0-9a-z\-]+)", r"file_(?P\d+)", r"ref_id=(?P\d+)", - r"target=[a-z]+_(?P\d+)" + r"target=[a-z]+_(?P\d+)", + r"mm_(?P\d+)" ] for regex in regexes: @@ -105,9 +108,9 @@ class IliasPage: if self._is_video_player(): log.explain("Page is a video player, extracting URL") return self._player_to_video() - if self._is_video_listing(): - log.explain("Page is a video listing, searching for elements") - return self._find_video_entries() + if self._is_opencast_video_listing(): + log.explain("Page is an opencast video listing, searching for elements") + return self._find_opencast_video_entries() if 
self._is_exercise_file(): log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() @@ -199,9 +202,9 @@ class IliasPage: if self._is_ilias_opencast_embedding(): log.explain("Unwrapping opencast embedding") return self.get_child_elements()[0] - if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + if self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED: log.explain("Unwrapping video pagination") - return self._find_video_entries_paginated()[0] + return self._find_opencast_video_entries_paginated()[0] if self._contains_collapsed_future_meetings(): log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() @@ -219,7 +222,7 @@ class IliasPage: def _is_video_player(self) -> bool: return "paella_config_file" in str(self._soup) - def _is_video_listing(self) -> bool: + def _is_opencast_video_listing(self) -> bool: if self._is_ilias_opencast_embedding(): return True @@ -319,14 +322,14 @@ class IliasPage: # and just fetch the lone video url! if len(streams) == 1: video_url = streams[0]["sources"]["mp4"][0]["src"] - return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name)] log.explain(f"Found multiple videos for stream at {self._source_name}") items = [] for stream in sorted(streams, key=lambda stream: stream["content"]): full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" video_url = stream["sources"]["mp4"][0]["src"] - items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) + items.append(IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, full_name)) return items @@ -385,7 +388,7 @@ class IliasPage: return items - def _find_video_entries(self) -> List[IliasPageElement]: + def _find_opencast_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. 
The initial dummy page without any videos. This page contains the link to the listing # 2. The video listing which might be paginated @@ -405,27 +408,27 @@ class IliasPage: query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) log.explain("Found ILIAS video frame page, fetching actual content next") - return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None - if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: + if is_paginated and not self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER: # We are in stage 2 - try to break pagination - return self._find_video_entries_paginated() + return self._find_opencast_video_entries_paginated() - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() - def _find_video_entries_paginated(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_paginated(self) -> List[IliasPageElement]: table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) if table_element is None: log.warn("Couldn't increase elements per page (table not found). I might miss elements.") - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) if id_match is None: log.warn("Couldn't increase elements per page (table id not found). 
I might miss elements.") - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() table_id = id_match.group(1) @@ -434,9 +437,9 @@ class IliasPage: url = url_set_query_params(self._page_url, query_params) log.explain("Disabled pagination, retrying folder as a new entry") - return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] - def _find_video_entries_no_paging(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]: """ Crawls the "second stage" video page. This page contains the actual video urls. """ @@ -448,11 +451,11 @@ class IliasPage: results: List[IliasPageElement] = [] for link in video_links: - results.append(self._listed_video_to_element(link)) + results.append(self._listed_opencast_video_to_element(link)) return results - def _listed_video_to_element(self, link: Tag) -> IliasPageElement: + def _listed_opencast_video_to_element(self, link: Tag) -> IliasPageElement: # The link is part of a table with multiple columns, describing metadata. # 6th or 7th child (1 indexed) is the modification time string. 
Try to find it # by parsing backwards from the end and finding something that looks like a date @@ -479,7 +482,9 @@ class IliasPage: video_url = self._abs_url_from_link(link) log.explain(f"Found video {video_name!r} at {video_url}") - return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) + return IliasPageElement( + IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time + ) def _find_exercise_entries(self) -> List[IliasPageElement]: if self._soup.find(id="tab_submission"): @@ -622,9 +627,48 @@ class IliasPage: result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) result += self._find_cards() + result += self._find_mediacast_videos() return result + def _find_mediacast_videos(self) -> List[IliasPageElement]: + videos: List[IliasPageElement] = [] + + for elem in cast(List[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): + element_name = _sanitize_path_name( + elem.select_one(".ilPlayerPreviewDescription").getText().strip() + ) + if not element_name.endswith(".mp4"): + # just to make sure it has some kinda-alrightish ending + element_name = element_name + ".mp4" + video_element = elem.find(name="video") + if not video_element: + _unexpected_html_warning() + log.warn_contd(f"No