From 722d2eb393913e770aff17da6b5b3b6603d1ee67 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 25 Nov 2022 12:49:36 +0100 Subject: [PATCH 001/115] Fix crawling of courses with preselected timeline tab --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8793d43..b1d18cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Fixed +- Crawling of courses with the timeline view as the default tab + ## 3.4.3 - 2022-11-29 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index c0ebdc9..44e44d9 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -158,6 +158,8 @@ class IliasPage: if self._contains_collapsed_future_meetings(): log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() + if not self._is_content_tab_selected(): + return self._select_content_page_url() return None def _is_forum_page(self) -> bool: @@ -220,6 +222,27 @@ class IliasPage: link = self._abs_url_from_link(element) return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + def _is_content_tab_selected(self) -> bool: + return self._select_content_page_url() is None + + def _select_content_page_url(self) -> Optional[IliasPageElement]: + tab = self._soup.find( + id="tab_view_content", + attrs={"class": lambda x: x is not None and "active" not in x} + ) + # Already selected (or not found) + if not tab: + return None + link = tab.find("a") + if link: + link = self._abs_url_from_link(link) + return IliasPageElement(IliasElementType.FOLDER, link, "select content page") + + _unexpected_html_warning() + log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.") + log.warn_contd("PFERD might not find content on the course's main page.") + return None + def _player_to_video(self) -> 
List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. The actual video stream url is nowhere From 467fc526e8411d4a5113dbb78747aa119981c476 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 21 Mar 2023 23:52:24 +0100 Subject: [PATCH 002/115] Fix crawling of file/video cards --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1d18cd..c27059b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Crawling of courses with the timeline view as the default tab +- Crawling of file and custom opencast cards ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 44e44d9..079cfd6 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -738,7 +738,7 @@ class IliasPage: icon: Tag = card_root.select_one(".il-card-repository-head .icon") - if "opencast" in icon["class"]: + if "opencast" in icon["class"] or "xoct" in icon["class"]: return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED if "exc" in icon["class"]: return IliasElementType.EXERCISE @@ -758,6 +758,8 @@ class IliasPage: return IliasElementType.FOLDER if "svy" in icon["class"]: return IliasElementType.SURVEY + if "file" in icon["class"]: + return IliasElementType.FILE _unexpected_html_warning() log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") From 6f30c6583d6512c92042c581e86027a4341ddc89 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 21 Mar 2023 23:52:33 +0100 Subject: [PATCH 003/115] Fix crawling of cards without descriptions --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c27059b..7a5f654 100644 --- a/CHANGELOG.md 
+++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Fixed - Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards +- Crawling of button cards without descriptions ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 079cfd6..efe6757 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -708,7 +708,11 @@ class IliasPage: "div", attrs={"class": lambda x: x and "caption" in x}, ) - description = caption_parent.find_next_sibling("div").getText().strip() + caption_container = caption_parent.find_next_sibling("div") + if caption_container: + description = caption_container.getText().strip() + else: + description = None if not type: _unexpected_html_warning() From 0294ceb7d5ff074dcc2566872d6b5f64f99c598f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 22 Mar 2023 00:08:19 +0100 Subject: [PATCH 004/115] Update github action versions --- .github/workflows/build-and-release.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 090ac7e..83a36e4 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -17,9 +17,9 @@ jobs: python: ["3.9"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} @@ -45,7 +45,7 @@ jobs: run: mv dist/pferd* dist/pferd-${{ matrix.os }} - name: Upload binary - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: Binaries path: dist/pferd-${{ matrix.os }} @@ -57,7 +57,7 @@ jobs: steps: - name: Download binaries - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: Binaries From 443f7fe83913bcb82a42d7b70d4d05df65f05278 Mon Sep 17 00:00:00 2001 From: 
"Mr. Pine" Date: Sat, 29 Jul 2023 17:54:42 +0200 Subject: [PATCH 005/115] Add `no-delete-prompt-overwrite` crawler conflict resolution option (#75) --- CHANGELOG.md | 3 +++ CONFIG.md | 2 ++ LICENSE | 3 ++- PFERD/output_dir.py | 11 ++++++----- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a5f654..22522e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ ambiguous situations. - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +### Added +- `no-delete-prompt-override` conflict resolution strategy + ## 3.4.3 - 2022-11-29 ### Added diff --git a/CONFIG.md b/CONFIG.md index 640e4af..84ee885 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -75,6 +75,8 @@ common to all crawlers: using `prompt` and always choosing "yes". - `no-delete`: Never delete local files, but overwrite local files if the remote file is different. + - `no-delete-prompt-overwrite`: Never delete local files, but prompt to overwrite local files if the + remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). 
(Default: empty) - `tasks`: The maximum number of concurrent tasks (such as crawling or diff --git a/LICENSE b/LICENSE index fe2293f..d81e827 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,6 @@ Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, - TheChristophe, Scriptim, thelukasprobst, Toorero + TheChristophe, Scriptim, thelukasprobst, Toorero, + Mr-Pine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index c92f4a6..38d1288 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -44,6 +44,7 @@ class OnConflict(Enum): LOCAL_FIRST = "local-first" REMOTE_FIRST = "remote-first" NO_DELETE = "no-delete" + NO_DELETE_PROMPT_OVERWRITE = "no-delete-prompt-overwrite" @staticmethod def from_string(string: str) -> "OnConflict": @@ -51,7 +52,7 @@ class OnConflict(Enum): return OnConflict(string) except ValueError: raise ValueError("must be one of 'prompt', 'local-first'," - " 'remote-first', 'no-delete'") + " 'remote-first', 'no-delete', 'no-delete-prompt-overwrite'") @dataclass @@ -264,7 +265,7 @@ class OutputDirectory: on_conflict: OnConflict, path: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Replace {fmt_path(path)} with remote file?" return await prompt_yes_no(prompt, default=False) @@ -283,7 +284,7 @@ class OutputDirectory: on_conflict: OnConflict, path: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" 
return await prompt_yes_no(prompt, default=False) @@ -303,7 +304,7 @@ class OutputDirectory: path: PurePath, parent: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" return await prompt_yes_no(prompt, default=False) @@ -330,7 +331,7 @@ class OutputDirectory: return False elif on_conflict == OnConflict.REMOTE_FIRST: return True - elif on_conflict == OnConflict.NO_DELETE: + elif on_conflict in {OnConflict.NO_DELETE, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: return False # This should never be reached From d204dac8ced63534ca2b4596e9a63c880b2077a3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 2 Jun 2023 18:19:39 +0200 Subject: [PATCH 006/115] Detect unexpected root page redirects and abort operation --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 10 ++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 20 ++++++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22522e2..ee55659 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ ambiguous situations. 
- Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +- Abort crawling when encountering an unexpected ilias root page redirect ### Added - `no-delete-prompt-override` conflict resolution strategy diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index efe6757..aed2069 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -79,6 +79,16 @@ class IliasPage: self._page_type = source_element.type if source_element else None self._source_name = source_element.name if source_element else "" + @staticmethod + def is_root_page(soup: BeautifulSoup) -> bool: + permalink = soup.find(id="current_perma_link") + if permalink is None: + return False + value = permalink.attrs.get("value") + if value is None: + return False + return "goto.php?target=root_" in value + def get_child_elements(self) -> List[IliasPageElement]: """ Return all child page elements you can find here. diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index e3719b8..ae49edc 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -239,7 +239,7 @@ instance's greatest bottleneck. # Duplicated code, but the root page is special - we want to avoid fetching it twice! while next_stage_url: - soup = await self._get_page(next_stage_url) + soup = await self._get_page(next_stage_url, root_page_allowed=True) if current_parent is None and expected_id is not None: perma_link_element: Tag = soup.find(id="current_perma_link") @@ -739,12 +739,12 @@ instance's greatest bottleneck. 
sink.file.write(content.encode("utf-8")) sink.done() - async def _get_page(self, url: str) -> BeautifulSoup: + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) # We weren't authenticated, so try to do that await self.authenticate(auth_id) @@ -753,9 +753,21 @@ instance's greatest bottleneck. async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) raise CrawlError("get_page failed even after authenticating") + def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: + if IliasPage.is_root_page(soup) and not root_page_allowed: + raise CrawlError( + "Unexpectedly encountered ILIAS root page. " + "This usually happens because the ILIAS instance is broken. " + "If so, wait a day or two and try again. " + "It could also happen because a crawled element links to the ILIAS root page. " + "If so, use a transform with a ! as target to ignore the particular element. 
" + f"The redirect came from {url}" + ) + return soup + async def _post_authenticated( self, url: str, From 123a57beec37090310f76df3746e6ce107ceb299 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Jul 2023 18:14:57 +0200 Subject: [PATCH 007/115] Fix mypy unreachable error in file_templates --- PFERD/crawl/ilias/file_templates.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 151a41b..59123a2 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -102,24 +102,24 @@ class Links(Enum): INTERNET_SHORTCUT = "internet-shortcut" def template(self) -> Optional[str]: - if self == self.FANCY: + if self == Links.FANCY: return _link_template_fancy - elif self == self.PLAINTEXT: + elif self == Links.PLAINTEXT: return _link_template_plain - elif self == self.INTERNET_SHORTCUT: + elif self == Links.INTERNET_SHORTCUT: return _link_template_internet_shortcut - elif self == self.IGNORE: + elif self == Links.IGNORE: return None raise ValueError("Missing switch case") def extension(self) -> Optional[str]: - if self == self.FANCY: + if self == Links.FANCY: return ".html" - elif self == self.PLAINTEXT: + elif self == Links.PLAINTEXT: return ".txt" - elif self == self.INTERNET_SHORTCUT: + elif self == Links.INTERNET_SHORTCUT: return ".url" - elif self == self.IGNORE: + elif self == Links.IGNORE: return None raise ValueError("Missing switch case") From 68c398f1fea5cfefd86d11e79f2f6582d50e6563 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Jul 2023 23:23:10 +0200 Subject: [PATCH 008/115] Add support for ILIAS learning modules --- CHANGELOG.md | 1 + PFERD/crawl/ilias/file_templates.py | 69 +++++++++ PFERD/crawl/ilias/ilias_html_cleaner.py | 2 +- PFERD/crawl/ilias/kit_ilias_html.py | 46 ++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 160 ++++++++++++++++++++- 5 files changed, 272 insertions(+), 6 deletions(-) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index ee55659..6e3925c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ ambiguous situations. ### Added - `no-delete-prompt-override` conflict resolution strategy +- support for ILIAS learning modules ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 59123a2..b206461 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,6 +1,10 @@ from enum import Enum from typing import Optional +import bs4 + +from PFERD.utils import soupify + _link_template_plain = "{{link}}" _link_template_fancy = """ @@ -94,6 +98,71 @@ _link_template_internet_shortcut = """ URL={{link}} """.strip() +_learning_module_template = """ + + + + + {{name}} + + + + +{{body}} + + +""" + + +def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str: + # Seems to be comments, ignore those. + for elem in body.select(".il-copg-mob-fullscreen-modal"): + elem.decompose() + + nav_template = """ + + """ + if prev and body.select_one(".ilc_page_lnav_LeftNavigation"): + text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip() + left = f'{text}' + else: + left = "" + + if next and body.select_one(".ilc_page_rnav_RightNavigation"): + text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip() + right = f'{text}' + else: + right = "" + + if top_nav := body.select_one(".ilc_page_tnav_TopNavigation"): + top_nav.replace_with( + soupify(nav_template.replace("{{left}}", left).replace("{{right}}", right).encode()) + ) + + if bot_nav := body.select_one(".ilc_page_bnav_BottomNavigation"): + bot_nav.replace_with(soupify(nav_template.replace( + "{{left}}", left).replace("{{right}}", right).encode()) + ) + + body = body.prettify() + return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name) + class Links(Enum): IGNORE = "ignore" diff --git 
a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index 5952309..5495304 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -82,7 +82,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: dummy.decompose() if len(children) > 1: continue - if type(children[0]) == Comment: + if isinstance(type(children[0]), Comment): dummy.decompose() for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index aed2069..46a8073 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -22,6 +22,7 @@ class IliasElementType(Enum): FOLDER = "folder" FORUM = "forum" LINK = "link" + LEARNING_MODULE = "learning_module" BOOKING = "booking" MEETING = "meeting" SURVEY = "survey" @@ -71,6 +72,14 @@ class IliasForumThread: mtime: Optional[datetime] +@dataclass +class IliasLearningModulePage: + title: str + content: Tag + next_url: Optional[str] + previous_url: Optional[str] + + class IliasPage: def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): @@ -136,6 +145,34 @@ class IliasPage: return BeautifulSoup(raw_html, "html.parser") + def get_learning_module_data(self) -> Optional[IliasLearningModulePage]: + if not self._is_learning_module_page(): + return None + content = self._soup.select_one("#ilLMPageContent") + title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip() + return IliasLearningModulePage( + title=title, + content=content, + next_url=self._find_learning_module_next(), + previous_url=self._find_learning_module_prev() + ) + + def _find_learning_module_next(self) -> Optional[str]: + for link in self._soup.select("a.ilc_page_rnavlink_RightNavigationLink"): + url = self._abs_url_from_link(link) + if "baseClass=ilLMPresentationGUI" not in url: + continue + return url + return None + + def 
_find_learning_module_prev(self) -> Optional[str]: + for link in self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink"): + url = self._abs_url_from_link(link) + if "baseClass=ilLMPresentationGUI" not in url: + continue + return url + return None + def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) if not form: @@ -222,6 +259,12 @@ class IliasPage: return False return "target=copa_" in link.get("value") + def _is_learning_module_page(self) -> bool: + link = self._soup.find(id="current_perma_link") + if not link: + return False + return "target=pg_" in link.get("value") + def _contains_collapsed_future_meetings(self) -> bool: return self._uncollapse_future_meetings_url() is not None @@ -812,6 +855,9 @@ class IliasPage: if "cmdClass=ilobjtestgui" in parsed_url.query: return IliasElementType.TEST + if "baseClass=ilLMPresentationGUI" in parsed_url.query: + return IliasElementType.LEARNING_MODULE + # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so # try to guess it from the image. 
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ae49edc..f82d684 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,8 +1,11 @@ import asyncio +import base64 +import os import re from collections.abc import Awaitable, Coroutine from pathlib import PurePath -from typing import Any, Callable, Dict, List, Optional, Set, Union, cast +from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast +from urllib.parse import urljoin import aiohttp import yarl @@ -16,10 +19,10 @@ from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import Links +from .file_templates import Links, learning_module_template from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, - _sanitize_path_name, parse_ilias_forum_export) +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, + IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -394,6 +397,8 @@ instance's greatest bottleneck. "[bright_black](surveys contain no relevant data)" ) return None + elif element.type == IliasElementType.LEARNING_MODULE: + return await self._handle_learning_module(element, element_path) elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) elif element.type == IliasElementType.BOOKING: @@ -739,6 +744,135 @@ instance's greatest bottleneck. 
sink.file.write(content.encode("utf-8")) sink.done() + async def _handle_learning_module( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Coroutine[Any, Any, None]]: + maybe_cl = await self.crawl(element_path) + if not maybe_cl: + return None + return self._crawl_learning_module(element, maybe_cl) + + @_iorepeat(3, "crawling learning module") + @anoncritical + async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None: + elements: List[IliasLearningModulePage] = [] + + async with cl: + log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {element.url}") + soup = await self._get_page(element.url) + page = IliasPage(soup, element.url, None) + if next := page.get_learning_module_data(): + elements.extend(await self._crawl_learning_module_direction( + cl.path, next.previous_url, "left" + )) + elements.append(next) + elements.extend(await self._crawl_learning_module_direction( + cl.path, next.next_url, "right" + )) + + # Reflect their natural ordering in the file names + for index, lm_element in enumerate(elements): + lm_element.title = f"{index:02}_{lm_element.title}" + + tasks: List[Awaitable[None]] = [] + for index, elem in enumerate(elements): + prev_url = elements[index - 1].title if index > 0 else None + next_url = elements[index + 1].title if index < len(elements) - 1 else None + tasks.append(asyncio.create_task( + self._download_learning_module_page(cl.path, elem, prev_url, next_url) + )) + + # And execute them + await self.gather(tasks) + + async def _crawl_learning_module_direction( + self, + path: PurePath, + start_url: Optional[str], + dir: Union[Literal["left"], Literal["right"]] + ) -> List[IliasLearningModulePage]: + elements: List[IliasLearningModulePage] = [] + + if not start_url: + return elements + + next_element_url: Optional[str] = start_url + counter = 0 + while next_element_url: + log.explain_topic(f"Parsing HTML page for {fmt_path(path)} 
({dir}-{counter})") + log.explain(f"URL: {next_element_url}") + soup = await self._get_page(next_element_url) + page = IliasPage(soup, next_element_url, None) + if next := page.get_learning_module_data(): + elements.append(next) + if dir == "left": + next_element_url = next.previous_url + else: + next_element_url = next.next_url + counter += 1 + + return elements + + @anoncritical + @_iorepeat(3, "saving learning module page") + async def _download_learning_module_page( + self, + parent_path: PurePath, + element: IliasLearningModulePage, + prev: Optional[str], + next: Optional[str] + ) -> None: + path = parent_path / (_sanitize_path_name(element.title) + ".html") + maybe_dl = await self.download(path) + if not maybe_dl: + return + my_path = self._transformer.transform(maybe_dl.path) + if not my_path: + return + + if prev: + prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) + if prev_p: + prev = os.path.relpath(prev_p, my_path.parent) + else: + prev = None + if next: + next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) + if next_p: + next = os.path.relpath(next_p, my_path.parent) + else: + next = None + + async with maybe_dl as (bar, sink): + content = element.content + content = await self.internalize_images(content) + sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8")) + sink.done() + + async def internalize_images(self, tag: Tag) -> Tag: + """ + Tries to fetch ILIAS images and embed them as base64 data. 
+ """ + log.explain_topic("Internalizing images") + for elem in tag.find_all(recursive=True): + if not isinstance(elem, Tag): + continue + if elem.name == "img": + if src := elem.attrs.get("src", None): + url = urljoin(_ILIAS_URL, src) + if not url.startswith(_ILIAS_URL): + continue + log.explain(f"Internalizing {url!r}") + img = await self._get_authenticated(url) + elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode() + if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): + # For unknown reasons the protocol seems to be stripped. + elem.attrs["src"] = "https:" + elem.attrs["src"] + return tag + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: @@ -772,7 +906,7 @@ instance's greatest bottleneck. self, url: str, data: dict[str, Union[str, List[str]]] - ) -> BeautifulSoup: + ) -> bytes: auth_id = await self._current_auth_id() form_data = aiohttp.FormData() @@ -792,6 +926,22 @@ instance's greatest bottleneck. return await request.read() raise CrawlError("post_authenticated failed even after authenticating") + async def _get_authenticated(self, url: str) -> bytes: + auth_id = await self._current_auth_id() + + async with self.session.get(url, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) + + # Retry once after authenticating. If this fails, we will die. + async with self.session.get(url, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + raise CrawlError("get_authenticated failed even after authenticating") + # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. 
@ _iorepeat(3, "Login", failure_is_error=True) From dbc2553b119c39c7a8ad196c6858fc8109f746a9 Mon Sep 17 00:00:00 2001 From: "Mr. Pine" <50425705+Mr-Pine@users.noreply.github.com> Date: Wed, 15 Mar 2023 15:33:42 +0100 Subject: [PATCH 009/115] Add default `show-not-deleted` option If set to `no`, PFERD won't print status or report messages for not deleted files --- CHANGELOG.md | 3 +++ CONFIG.md | 8 ++++++-- PFERD/__main__.py | 4 ++++ PFERD/cli/parser.py | 7 +++++++ PFERD/config.py | 3 +++ PFERD/logging.py | 20 ++++++++++++++++++++ PFERD/output_dir.py | 2 +- PFERD/pferd.py | 2 +- 8 files changed, 45 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e3925c..85513d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,9 @@ ambiguous situations. ### Added - `no-delete-prompt-override` conflict resolution strategy - support for ILIAS learning modules +- `show_not_deleted` option to stop printing the "Not Deleted" status or report + message. This combines nicely with the `no-delete-prompt-override` strategy, + causing PFERD to mostly ignore local-only files. ## 3.4.3 - 2022-11-29 diff --git a/CONFIG.md b/CONFIG.md index 84ee885..5f62749 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -26,6 +26,9 @@ default values for the other sections. `Added ...`) while running a crawler. (Default: `yes`) - `report`: Whether PFERD should print a report of added, changed and deleted local files for all crawlers before exiting. (Default: `yes`) +- `show_not_deleted`: Whether PFERD should print messages in status and report + when a local-only file wasn't deleted. Combines nicely with the + `no-delete-prompt-override` conflict resolution strategy. - `share_cookies`: Whether crawlers should share cookies where applicable. For example, some crawlers share cookies if they crawl the same website using the same account. (Default: `yes`) @@ -75,8 +78,9 @@ common to all crawlers: using `prompt` and always choosing "yes". 
- `no-delete`: Never delete local files, but overwrite local files if the remote file is different. - - `no-delete-prompt-overwrite`: Never delete local files, but prompt to overwrite local files if the - remote file is different. + - `no-delete-prompt-overwrite`: Never delete local files, but prompt to + overwrite local files if the remote file is different. Combines nicely + with the `show_not_deleted` option. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) - `tasks`: The maximum number of concurrent tasks (such as crawling or diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 4faeb13..cb8c67c 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -47,6 +47,8 @@ def configure_logging_from_args(args: argparse.Namespace) -> None: log.output_explain = args.explain if args.status is not None: log.output_status = args.status + if args.show_not_deleted is not None: + log.output_not_deleted = args.show_not_deleted if args.report is not None: log.output_report = args.report @@ -72,6 +74,8 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N log.output_status = config.default_section.status() if args.report is None: log.output_report = config.default_section.report() + if args.show_not_deleted is None: + log.output_not_deleted = config.default_section.show_not_deleted() except ConfigOptionError as e: log.error(str(e)) sys.exit(1) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index e753023..be483fd 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -215,6 +215,11 @@ PARSER.add_argument( action=BooleanOptionalAction, help="whether crawlers should share cookies where applicable" ) +PARSER.add_argument( + "--show-not-deleted", + action=BooleanOptionalAction, + help="print messages in status and report when PFERD did not delete a local only file" +) def load_default_section( @@ -233,6 +238,8 @@ def 
load_default_section( section["report"] = "yes" if args.report else "no" if args.share_cookies is not None: section["share_cookies"] = "yes" if args.share_cookies else "no" + if args.show_not_deleted is not None: + section["show_not_deleted"] = "yes" if args.show_not_deleted else "no" SUBPARSERS = PARSER.add_subparsers(title="crawlers") diff --git a/PFERD/config.py b/PFERD/config.py index 8f7e682..b2cff4e 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -82,6 +82,9 @@ class DefaultSection(Section): def report(self) -> bool: return self.s.getboolean("report", fallback=True) + def show_not_deleted(self) -> bool: + return self.s.getboolean("show_not_deleted", fallback=True) + def share_cookies(self) -> bool: return self.s.getboolean("share_cookies", fallback=True) diff --git a/PFERD/logging.py b/PFERD/logging.py index 340b21f..b958fb2 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -59,6 +59,7 @@ class Log: # Whether different parts of the output are enabled or disabled self.output_explain = False self.output_status = True + self.output_not_deleted = True self.output_report = True def _update_live(self) -> None: @@ -207,6 +208,17 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new action = escape(f"{action:<{self.STATUS_WIDTH}}") self.print(f"{style}{action}[/] {escape(text)} {suffix}") + def not_deleted(self, style: str, action: str, text: str, suffix: str = "") -> None: + """ + Print a message for a local only file that wasn't + deleted while crawling. Allows markup in the "style" + argument which will be applied to the "action" string. + """ + + if self.output_status and self.output_not_deleted: + action = escape(f"{action:<{self.STATUS_WIDTH}}") + self.print(f"{style}{action}[/] {escape(text)} {suffix}") + def report(self, text: str) -> None: """ Print a report after crawling. Allows markup. 
@@ -215,6 +227,14 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_report: self.print(text) + def report_not_deleted(self, text: str) -> None: + """ + Print a report for a local only file that wasn't deleted after crawling. Allows markup. + """ + + if self.output_report and self.output_not_deleted: + self.print(text) + @contextmanager def _bar( self, diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 38d1288..e9e9b93 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -496,7 +496,7 @@ class OutputDirectory: except OSError: pass else: - log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure)) + log.not_deleted("[bold bright_magenta]", "Not deleted", fmt_path(pure)) self._report.not_delete_file(pure) def load_prev_report(self) -> None: diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 079053b..b30a04a 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -180,7 +180,7 @@ class Pferd: log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}") for path in sorted(crawler.report.not_deleted_files): something_changed = True - log.report(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") + log.report_not_deleted(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") for warning in crawler.report.encountered_warnings: something_changed = True From b3d412360baeed6992535e6957d0bc1e368c337f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 26 Aug 2023 23:48:14 +0200 Subject: [PATCH 010/115] Add Nix flake --- flake.lock | 27 +++++++++++++++++++++++++++ flake.nix | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..914c58b --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1692986144, + "narHash": "sha256-M4VFpy7Av9j+33HF5nIGm0k2+DXXW4qSSKdidIKg5jY=", + "owner": 
"NixOS", + "repo": "nixpkgs", + "rev": "74e5bdc5478ebbe7ba5849f0d765f92757bb9dbf", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-23.05", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..e3d52af --- /dev/null +++ b/flake.nix @@ -0,0 +1,41 @@ +{ + description = "Tool for downloading course-related files from ILIAS"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05"; + }; + + outputs = { self, nixpkgs }: + let + # Helper function to generate an attrset '{ x86_64-linux = f "x86_64-linux"; ... }'. + forAllSystems = nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed; + in + { + packages = forAllSystems (system: + let pkgs = import nixpkgs { inherit system; }; + in + rec { + default = pkgs.python3Packages.buildPythonApplication rec { + pname = "pferd"; + # Performing black magic + # Don't worry, I sacrificed enough goats for the next few years + version = (pkgs.lib.importTOML ./PFERD/version.py).VERSION; + format = "pyproject"; + + src = ./.; + + nativeBuildInputs = with pkgs.python3Packages; [ + setuptools + ]; + + propagatedBuildInputs = with pkgs.python3Packages; [ + aiohttp + beautifulsoup4 + rich + keyring + certifi + ]; + }; + }); + }; +} From 2184ac804018e836e439e365ae2b0d184adae26d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 26 Aug 2023 19:39:40 +0200 Subject: [PATCH 011/115] Add support for ILIAS mediacast listings --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 110 +++++++++++++++------ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 45 +++++---- 3 files changed, 107 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85513d2..d58ea18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ ambiguous situations. 
- `show_not_deleted` option to stop printing the "Not Deleted" status or report message. This combines nicely with the `no-delete-prompt-override` strategy, causing PFERD to mostly ignore local-only files. +- support for mediacast video listings ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 46a8073..d5ea76d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, cast from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -26,10 +26,12 @@ class IliasElementType(Enum): BOOKING = "booking" MEETING = "meeting" SURVEY = "survey" - VIDEO = "video" - VIDEO_PLAYER = "video_player" - VIDEO_FOLDER = "video_folder" - VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" + MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" + MEDIACAST_VIDEO = "mediacast_video" + OPENCAST_VIDEO = "opencast_video" + OPENCAST_VIDEO_PLAYER = "opencast_video_player" + OPENCAST_VIDEO_FOLDER = "opencast_video_folder" + OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated" @dataclass @@ -45,7 +47,8 @@ class IliasPageElement: r"eid=(?P<id>[0-9a-z\-]+)", r"file_(?P<id>\d+)", r"ref_id=(?P<id>\d+)", - r"target=[a-z]+_(?P<id>\d+)" + r"target=[a-z]+_(?P<id>\d+)", + r"mm_(?P<id>\d+)" ] for regex in regexes: @@ -105,9 +108,9 @@ class IliasPage: if self._is_video_player(): log.explain("Page is a video player, extracting URL") return self._player_to_video() - if self._is_video_listing(): - log.explain("Page is a video listing, searching for elements") - return self._find_video_entries() + if self._is_opencast_video_listing(): + log.explain("Page is an opencast video listing, searching for elements") + return self._find_opencast_video_entries() if
self._is_exercise_file(): log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() @@ -199,9 +202,9 @@ class IliasPage: if self._is_ilias_opencast_embedding(): log.explain("Unwrapping opencast embedding") return self.get_child_elements()[0] - if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + if self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED: log.explain("Unwrapping video pagination") - return self._find_video_entries_paginated()[0] + return self._find_opencast_video_entries_paginated()[0] if self._contains_collapsed_future_meetings(): log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() @@ -219,7 +222,7 @@ class IliasPage: def _is_video_player(self) -> bool: return "paella_config_file" in str(self._soup) - def _is_video_listing(self) -> bool: + def _is_opencast_video_listing(self) -> bool: if self._is_ilias_opencast_embedding(): return True @@ -319,14 +322,14 @@ class IliasPage: # and just fetch the lone video url! if len(streams) == 1: video_url = streams[0]["sources"]["mp4"][0]["src"] - return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name)] log.explain(f"Found multiple videos for stream at {self._source_name}") items = [] for stream in sorted(streams, key=lambda stream: stream["content"]): full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" video_url = stream["sources"]["mp4"][0]["src"] - items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) + items.append(IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, full_name)) return items @@ -385,7 +388,7 @@ class IliasPage: return items - def _find_video_entries(self) -> List[IliasPageElement]: + def _find_opencast_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. 
The initial dummy page without any videos. This page contains the link to the listing # 2. The video listing which might be paginated @@ -405,27 +408,27 @@ class IliasPage: query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) log.explain("Found ILIAS video frame page, fetching actual content next") - return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None - if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: + if is_paginated and not self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER: # We are in stage 2 - try to break pagination - return self._find_video_entries_paginated() + return self._find_opencast_video_entries_paginated() - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() - def _find_video_entries_paginated(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_paginated(self) -> List[IliasPageElement]: table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) if table_element is None: log.warn("Couldn't increase elements per page (table not found). I might miss elements.") - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) if id_match is None: log.warn("Couldn't increase elements per page (table id not found). 
I might miss elements.") - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() table_id = id_match.group(1) @@ -434,9 +437,9 @@ class IliasPage: url = url_set_query_params(self._page_url, query_params) log.explain("Disabled pagination, retrying folder as a new entry") - return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] - def _find_video_entries_no_paging(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]: """ Crawls the "second stage" video page. This page contains the actual video urls. """ @@ -448,11 +451,11 @@ class IliasPage: results: List[IliasPageElement] = [] for link in video_links: - results.append(self._listed_video_to_element(link)) + results.append(self._listed_opencast_video_to_element(link)) return results - def _listed_video_to_element(self, link: Tag) -> IliasPageElement: + def _listed_opencast_video_to_element(self, link: Tag) -> IliasPageElement: # The link is part of a table with multiple columns, describing metadata. # 6th or 7th child (1 indexed) is the modification time string. 
Try to find it # by parsing backwards from the end and finding something that looks like a date @@ -479,7 +482,9 @@ class IliasPage: video_url = self._abs_url_from_link(link) log.explain(f"Found video {video_name!r} at {video_url}") - return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) + return IliasPageElement( + IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time + ) def _find_exercise_entries(self) -> List[IliasPageElement]: if self._soup.find(id="tab_submission"): @@ -622,9 +627,48 @@ class IliasPage: result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) result += self._find_cards() + result += self._find_mediacast_videos() return result + def _find_mediacast_videos(self) -> List[IliasPageElement]: + videos: List[IliasPageElement] = [] + + for elem in cast(List[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): + element_name = _sanitize_path_name( + elem.select_one(".ilPlayerPreviewDescription").getText().strip() + ) + if not element_name.endswith(".mp4"): + # just to make sure it has some kinda-alrightish ending + element_name = element_name + ".mp4" + video_element = elem.find(name="video") + if not video_element: + _unexpected_html_warning() + log.warn_contd(f"No