From 266812f90ea7b33e2cd195ee6d34dc2ba53c4926 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 16 Nov 2023 10:34:49 +0100 Subject: [PATCH 01/96] Move is_logged_in helper to kit_ilias_html --- PFERD/crawl/ilias/kit_ilias_html.py | 28 +++++++++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 35 +++------------------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 2c37816..d23141f 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -1067,6 +1067,34 @@ class IliasPage: rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name + @staticmethod + def is_logged_in(soup: BeautifulSoup) -> bool: + # Normal ILIAS pages + mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") + if mainbar is not None: + login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) + shib_login = soup.find(id="button_shib_login") + return not login_button and not shib_login + + # Personal Desktop + if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): + return True + + # Video listing embeds do not have complete ILIAS html. Try to match them by + # their video listing table + video_table = soup.find( + recursive=True, + name="table", + attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} + ) + if video_table is not None: + return True + # The individual video player wrapper page has nothing of the above. + # Match it by its playerContainer. + if soup.select_one("#playerContainer") is not None: + return True + return False + def _abs_url_from_link(self, link_tag: Tag) -> str: """ Create an absolute url from an tag. 
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index d5f6809..94b7b9e 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -894,7 +894,7 @@ instance's greatest bottleneck. auth_id = await self._current_auth_id() async with self.session.get(url) as request: soup = soupify(await request.read()) - if self._is_logged_in(soup): + if IliasPage.is_logged_in(soup): return self._verify_page(soup, url, root_page_allowed) # We weren't authenticated, so try to do that @@ -903,11 +903,12 @@ instance's greatest bottleneck. # Retry once after authenticating. If this fails, we will die. async with self.session.get(url) as request: soup = soupify(await request.read()) - if self._is_logged_in(soup): + if IliasPage.is_logged_in(soup): return self._verify_page(soup, url, root_page_allowed) raise CrawlError(f"get_page failed even after authenticating on {url!r}") - def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: + @staticmethod + def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: if IliasPage.is_root_page(soup) and not root_page_allowed: raise CrawlError( "Unexpectedly encountered ILIAS root page. " @@ -965,34 +966,6 @@ instance's greatest bottleneck. async def _authenticate(self) -> None: await self._shibboleth_login.login(self.session) - @ staticmethod - def _is_logged_in(soup: BeautifulSoup) -> bool: - # Normal ILIAS pages - mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") - if mainbar is not None: - login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) - shib_login = soup.find(id="button_shib_login") - return not login_button and not shib_login - - # Personal Desktop - if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): - return True - - # Video listing embeds do not have complete ILIAS html. 
Try to match them by - # their video listing table - video_table = soup.find( - recursive=True, - name="table", - attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")} - ) - if video_table is not None: - return True - # The individual video player wrapper page has nothing of the above. - # Match it by its playerContainer. - if soup.select_one("#playerContainer") is not None: - return True - return False - class KitShibbolethLogin: """ From e9f8901520356e23a7fe75c232e2abeb65e2d5a7 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 30 Nov 2023 20:50:53 +0100 Subject: [PATCH 02/96] Fix typos in ilias crawler and use set literals --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 94b7b9e..b9fb45a 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -81,7 +81,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): return self.s.getboolean("forums", fallback=False) -_DIRECTORY_PAGES: Set[IliasElementType] = set([ +_DIRECTORY_PAGES: Set[IliasElementType] = { IliasElementType.EXERCISE, IliasElementType.EXERCISE_FILES, IliasElementType.FOLDER, @@ -90,16 +90,16 @@ _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.MEDIACAST_VIDEO_FOLDER, IliasElementType.OPENCAST_VIDEO_FOLDER, IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, -]) +} -_VIDEO_ELEMENTS: Set[IliasElementType] = set([ +_VIDEO_ELEMENTS: Set[IliasElementType] = { IliasElementType.MEDIACAST_VIDEO_FOLDER, IliasElementType.MEDIACAST_VIDEO, IliasElementType.OPENCAST_VIDEO, IliasElementType.OPENCAST_VIDEO_PLAYER, IliasElementType.OPENCAST_VIDEO_FOLDER, IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, -]) +} def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Callable[[AWrapped], AWrapped]: @@ -561,8 +561,8 @@ instance's greatest 
bottleneck. # If we do not want to crawl it (user filter) or we have every file # from the cached mapping already, we can ignore this and bail if not maybe_dl or self._all_opencast_videos_locally_present(element_path): - # Mark all existing cideos as known so they do not get deleted - # during dleanup. We "downloaded" them, just without actually making + # Mark all existing videos as known so they do not get deleted + # during cleanup. We "downloaded" them, just without actually making # a network request as we assumed they did not change. for video in self._previous_contained_opencast_videos(element_path): await self.download(video) From a117126389a6298d04944ddbcda35f9b537e960b Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 9 Dec 2023 23:01:59 +0100 Subject: [PATCH 03/96] Fix video name deduplication --- CHANGELOG.md | 3 + PFERD/crawl/ilias/kit_ilias_web_crawler.py | 117 +++++++++++---------- 2 files changed, 65 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e902efa..0443d50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Fixed +- Video name deduplication + ## 3.5.0 - 2023-09-13 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index b9fb45a..ac1f10d 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -140,6 +140,10 @@ def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]: return _iorepeat(1, name) +def _get_video_cache_key(element: IliasPageElement) -> str: + return f"ilias-video-cache-{element.id()}" + + # Crawler control flow: # # crawl_desktop -+ @@ -547,8 +551,8 @@ instance's greatest bottleneck. 
# Copy old mapping as it is likely still relevant if self.prev_report: self.report.add_custom_value( - str(element_path), - self.prev_report.get_custom_value(str(element_path)) + _get_video_cache_key(element), + self.prev_report.get_custom_value(_get_video_cache_key(element)) ) # A video might contain other videos, so let's "crawl" the video first @@ -558,58 +562,69 @@ instance's greatest bottleneck. # to ensure backwards compatibility. maybe_dl = await self.download(element_path, mtime=element.mtime, redownload=Redownload.ALWAYS) - # If we do not want to crawl it (user filter) or we have every file - # from the cached mapping already, we can ignore this and bail - if not maybe_dl or self._all_opencast_videos_locally_present(element_path): - # Mark all existing videos as known so they do not get deleted - # during cleanup. We "downloaded" them, just without actually making - # a network request as we assumed they did not change. - for video in self._previous_contained_opencast_videos(element_path): - await self.download(video) + # If we do not want to crawl it (user filter), we can move on + if not maybe_dl: + return None + + # If we have every file from the cached mapping already, we can ignore this and bail + if self._all_opencast_videos_locally_present(element, maybe_dl.path): + # Mark all existing videos as known to ensure they do not get deleted during cleanup. + # We "downloaded" them, just without actually making a network request as we assumed + # they did not change. 
+ contained = self._previous_contained_opencast_videos(element, maybe_dl.path) + if len(contained) > 1: + # Only do this if we threw away the original dl token, + # to not download single-stream videos twice + for video in contained: + await self.download(video) return None - return self._download_opencast_video(element_path, element, maybe_dl) + return self._download_opencast_video(element, maybe_dl) - def _previous_contained_opencast_videos(self, video_path: PurePath) -> List[PurePath]: + def _previous_contained_opencast_videos( + self, element: IliasPageElement, element_path: PurePath + ) -> List[PurePath]: if not self.prev_report: return [] - custom_value = self.prev_report.get_custom_value(str(video_path)) + custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element)) if not custom_value: return [] - names = cast(List[str], custom_value) - folder = video_path.parent - return [PurePath(folder, name) for name in names] + cached_value = cast(dict[str, Any], custom_value) + if "known_paths" not in cached_value or "own_path" not in cached_value: + log.explain(f"'known_paths' or 'own_path' missing from cached value: {cached_value}") + return [] + transformed_own_path = self._transformer.transform(element_path) + if cached_value["own_path"] != str(transformed_own_path): + log.explain( + f"own_path '{transformed_own_path}' does not match cached value: '{cached_value['own_path']}" + ) + return [] + return [PurePath(name) for name in cached_value["known_paths"]] - def _all_opencast_videos_locally_present(self, video_path: PurePath) -> bool: - if contained_videos := self._previous_contained_opencast_videos(video_path): - log.explain_topic(f"Checking local cache for video {video_path.name}") - all_found_locally = True - for video in contained_videos: - transformed_path = self._to_local_opencast_video_path(video) - if transformed_path: - exists_locally = self._output_dir.resolve(transformed_path).exists() - all_found_locally = all_found_locally and 
exists_locally - if all_found_locally: - log.explain("Found all videos locally, skipping enumeration request") + def _all_opencast_videos_locally_present(self, element: IliasPageElement, element_path: PurePath) -> bool: + log.explain_topic(f"Checking local cache for video {fmt_path(element_path)}") + if contained_videos := self._previous_contained_opencast_videos(element, element_path): + log.explain( + f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}" + ) + if all(self._output_dir.resolve(path).exists() for path in contained_videos): + log.explain("Found all known videos locally, skipping enumeration request") return True log.explain("Missing at least one video, continuing with requests!") + else: + log.explain("No local cache present") return False - def _to_local_opencast_video_path(self, path: PurePath) -> Optional[PurePath]: - if transformed := self._transformer.transform(path): - return self._deduplicator.fixup_path(transformed) - return None - @anoncritical @_iorepeat(3, "downloading video") - async def _download_opencast_video( - self, - original_path: PurePath, - element: IliasPageElement, - dl: DownloadToken - ) -> None: - stream_elements: List[IliasPageElement] = [] + async def _download_opencast_video(self, element: IliasPageElement, dl: DownloadToken) -> None: + def add_to_report(paths: list[str]) -> None: + self.report.add_custom_value( + _get_video_cache_key(element), + {"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))} + ) + async with dl as (bar, sink): page = IliasPage(await self._get_page(element.url), element.url, element) stream_elements = page.get_child_elements() @@ -620,32 +635,25 @@ instance's greatest bottleneck. 
log.explain(f"Using single video mode for {element.name}") stream_element = stream_elements[0] - transformed_path = self._to_local_opencast_video_path(original_path) - if not transformed_path: - raise CrawlError(f"Download returned a path but transform did not for {original_path}") - # We do not have a local cache yet - if self._output_dir.resolve(transformed_path).exists(): - log.explain(f"Video for {element.name} existed locally") - else: - await self._stream_from_url(stream_element.url, sink, bar, is_video=True) - self.report.add_custom_value(str(original_path), [original_path.name]) + await self._stream_from_url(stream_element.url, sink, bar, is_video=True) + add_to_report([str(self._transformer.transform(dl.path))]) return contained_video_paths: List[str] = [] for stream_element in stream_elements: - video_path = original_path.parent / stream_element.name - contained_video_paths.append(str(video_path)) + video_path = dl.path.parent / stream_element.name maybe_dl = await self.download(video_path, mtime=element.mtime, redownload=Redownload.NEVER) if not maybe_dl: continue async with maybe_dl as (bar, sink): log.explain(f"Streaming video from real url {stream_element.url}") + contained_video_paths.append(str(self._transformer.transform(maybe_dl.path))) await self._stream_from_url(stream_element.url, sink, bar, is_video=True) - self.report.add_custom_value(str(original_path), contained_video_paths) + add_to_report(contained_video_paths) async def _handle_file( self, @@ -657,8 +665,8 @@ instance's greatest bottleneck. return None return self._download_file(element, maybe_dl) - @anoncritical @_iorepeat(3, "downloading file") + @anoncritical async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: assert dl # The function is only reached when dl is not None async with dl as (bar, sink): @@ -728,7 +736,6 @@ instance's greatest bottleneck. 
raise CrawlWarning("Failed to extract forum data") if download_data.empty: log.explain("Forum had no threads") - elements = [] return html = await self._post_authenticated(download_data.url, download_data.form_data) elements = parse_ilias_forum_export(soupify(html)) @@ -962,7 +969,7 @@ instance's greatest bottleneck. # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. - @ _iorepeat(3, "Login", failure_is_error=True) + @_iorepeat(3, "Login", failure_is_error=True) async def _authenticate(self) -> None: await self._shibboleth_login.login(self.session) @@ -1112,7 +1119,7 @@ async def _shib_post( async with session.get(correct_url, allow_redirects=False) as response: location = response.headers.get("location") log.explain(f"Redirected to {location!r} with status {response.status}") - # If shib still still has a valid session, it will directly respond to the request + # If shib still has a valid session, it will directly respond to the request if location is None: log.explain("Shib recognized us, returning its response directly") return soupify(await response.read()) From ab0cb2d956129c51b67e4573da7c5e95372e9f5f Mon Sep 17 00:00:00 2001 From: TornaxO7 Date: Tue, 27 Feb 2024 23:39:53 +0100 Subject: [PATCH 04/96] nix: bump nixpgs dependency --- flake.lock | 8 ++++---- flake.nix | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flake.lock b/flake.lock index 1655107..6428667 100644 --- a/flake.lock +++ b/flake.lock @@ -2,16 +2,16 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1694499547, - "narHash": "sha256-R7xMz1Iia6JthWRHDn36s/E248WB1/je62ovC/dUVKI=", + "lastModified": 1708979614, + "narHash": "sha256-FWLWmYojIg6TeqxSnHkKpHu5SGnFP5um1uUjH+wRV6g=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "e5f018cf150e29aac26c61dac0790ea023c46b24", + "rev": "b7ee09cf5614b02d289cd86fcfa6f24d4e078c2a", "type": "github" }, "original": { "owner": "NixOS", 
- "ref": "nixos-23.05", + "ref": "nixos-23.11", "repo": "nixpkgs", "type": "github" } diff --git a/flake.nix b/flake.nix index e3d52af..4fc47b2 100644 --- a/flake.nix +++ b/flake.nix @@ -2,7 +2,7 @@ description = "Tool for downloading course-related files from ILIAS"; inputs = { - nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05"; + nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11"; }; outputs = { self, nixpkgs }: From eb0c956d32b9181c46d0ca8ce4f5d3f871e2c1df Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 5 Apr 2024 19:06:54 +0200 Subject: [PATCH 05/96] Add compatibility with ILIAS 8 --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 50 ++++++++++++---------- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 35 +++++++-------- 3 files changed, 46 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0443d50..df4fcf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Video name deduplication +- Compatibility with ILIAS 8 ## 3.5.0 - 2023-09-13 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d23141f..0be6448 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -17,7 +17,7 @@ TargetType = Union[str, int] class IliasElementType(Enum): EXERCISE = "exercise" EXERCISE_FILES = "exercise_files" # own submitted files - TEST = "test" # an online test. Will be ignored currently. + TEST = "test" # an online test. Will be ignored currently. 
FILE = "file" FOLDER = "folder" FORUM = "forum" @@ -95,13 +95,9 @@ class IliasPage: @staticmethod def is_root_page(soup: BeautifulSoup) -> bool: - permalink = soup.find(id="current_perma_link") - if permalink is None: - return False - value = permalink.attrs.get("value") - if value is None: - return False - return "goto.php?target=root_" in value + if permalink := IliasPage.get_soup_permalink(soup): + return "goto.php?target=root_" in permalink + return False def get_child_elements(self) -> List[IliasPageElement]: """ @@ -279,16 +275,14 @@ class IliasPage: return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) def _is_content_page(self) -> bool: - link = self._soup.find(id="current_perma_link") - if not link: - return False - return "target=copa_" in link.get("value") + if link := self.get_permalink(): + return "target=copa_" in link + return False def _is_learning_module_page(self) -> bool: - link = self._soup.find(id="current_perma_link") - if not link: - return False - return "target=pg_" in link.get("value") + if link := self.get_permalink(): + return "target=pg_" in link + return False def _contains_collapsed_future_meetings(self) -> bool: return self._uncollapse_future_meetings_url() is not None @@ -513,8 +507,8 @@ class IliasPage: modification_string = link.parent.parent.parent.select_one( f"td.std:nth-child({index})" ).getText().strip() - if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string): - modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M") + if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string): + modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M") break if modification_time is None: @@ -613,7 +607,7 @@ class IliasPage: file_listings: List[Tag] = container.findAll( name="a", # download links contain the given command class - attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x} + attrs={"href": lambda x: x and 
"cmdclass=ilexsubmissionfilegui" in x.lower()} ) # Add each listing as a new @@ -917,9 +911,9 @@ class IliasPage: @staticmethod def _find_type_from_link( - element_name: str, - link_element: Tag, - url: str + element_name: str, + link_element: Tag, + url: str ) -> Optional[IliasElementType]: """ Decides which sub crawler to use for a given top level element. @@ -1095,6 +1089,9 @@ class IliasPage: return True return False + def get_permalink(self) -> Optional[str]: + return IliasPage.get_soup_permalink(self._soup) + def _abs_url_from_link(self, link_tag: Tag) -> str: """ Create an absolute url from an tag. @@ -1107,6 +1104,13 @@ class IliasPage: """ return urljoin(self._page_url, relative_url) + @staticmethod + def get_soup_permalink(soup: BeautifulSoup) -> Optional[str]: + perma_link_element: Tag = soup.select_one(".il-footer-permanent-url > a") + if not perma_link_element or not perma_link_element.get("href"): + return None + return perma_link_element.get("href") + def _unexpected_html_warning() -> None: log.warn("Encountered unexpected HTML structure, ignoring element.") @@ -1130,7 +1134,7 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) - date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) + date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) date_str = date_str.strip() for german, english in zip(german_months, english_months): date_str = date_str.replace(german, english) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ac1f10d..52de793 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -12,17 +12,17 @@ import yarl from aiohttp import hdrs from bs4 import BeautifulSoup, 
Tag +from .file_templates import Links, learning_module_template +from .ilias_html_cleaner import clean, insert_base_markup +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, + IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) +from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical +from ..http_crawler import HttpCrawler, HttpCrawlerSection from ...auth import Authenticator, TfaAuthenticator from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param -from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical -from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import Links, learning_module_template -from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, - IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -130,6 +130,7 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla raise CrawlError("Impossible return in ilias _iorepeat") return wrapper # type: ignore + return decorator @@ -177,11 +178,11 @@ def _get_video_cache_key(element: IliasPageElement) -> str: class KitIliasWebCrawler(HttpCrawler): def __init__( - self, - name: str, - section: KitIliasWebCrawlerSection, - config: Config, - authenticators: Dict[str, Authenticator] + self, + name: str, + section: KitIliasWebCrawlerSection, + config: Config, + authenticators: Dict[str, Authenticator] ): # Setting a main authenticator for cookie sharing auth = section.auth(authenticators) @@ -253,8 +254,8 @@ instance's greatest bottleneck. 
soup = await self._get_page(next_stage_url, root_page_allowed=True) if current_parent is None and expected_id is not None: - perma_link_element: Tag = soup.find(id="current_perma_link") - if not perma_link_element or "crs_" not in perma_link_element.get("value"): + perma_link = IliasPage.get_soup_permalink(soup) + if not perma_link or "crs_" not in perma_link: raise CrawlError("Invalid course id? Didn't find anything looking like a course") log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") @@ -677,7 +678,7 @@ instance's greatest bottleneck. async with self.session.get(url, allow_redirects=is_video) as resp: if not is_video: # Redirect means we weren't authenticated - if hdrs.LOCATION in resp.headers: + if hdrs.LOCATION in resp.headers and "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]: return False # we wanted a video but got HTML if is_video and "html" in resp.content_type: @@ -1052,9 +1053,9 @@ class KitShibbolethLogin: await sess.post(url, data=data) async def _authenticate_tfa( - self, - session: aiohttp.ClientSession, - soup: BeautifulSoup + self, + session: aiohttp.ClientSession, + soup: BeautifulSoup ) -> BeautifulSoup: if not self._tfa_auth: self._tfa_auth = TfaAuthenticator("ilias-anon-tfa") From c1b592ac2930c1ced40dd7282ae8bca4d1b6109d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 8 Apr 2024 17:52:13 +0200 Subject: [PATCH 06/96] Fix ILIAS 8 file downloads truncating to zero bytes --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 40 +++++++++++++++------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 52de793..7d6b309 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -12,17 +12,17 @@ import yarl from aiohttp import hdrs from bs4 import BeautifulSoup, Tag -from .file_templates import Links, learning_module_template -from .ilias_html_cleaner import clean, 
insert_base_markup -from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, - IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) -from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical -from ..http_crawler import HttpCrawler, HttpCrawlerSection from ...auth import Authenticator, TfaAuthenticator from ...config import Config from ...logging import ProgressBar, log from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param +from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical +from ..http_crawler import HttpCrawler, HttpCrawlerSection +from .file_templates import Links, learning_module_template +from .ilias_html_cleaner import clean, insert_base_markup +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, + IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -675,12 +675,28 @@ instance's greatest bottleneck. async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: async def try_stream() -> bool: - async with self.session.get(url, allow_redirects=is_video) as resp: - if not is_video: - # Redirect means we weren't authenticated - if hdrs.LOCATION in resp.headers and "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]: - return False - # we wanted a video but got HTML + next_url = url + + # Normal files redirect to the magazine if we are not authenticated. As files could be HTML, + # we can not match on the content type here. Instead, we disallow redirects and inspect the + # new location. If we are redirected anywhere but the ILIAS 8 "sendfile" command, we assume + # our authentication expired. 
+ if not is_video: + async with self.session.get(url, allow_redirects=False) as resp: + # Redirect to anything except a "sendfile" means we weren't authenticated + if hdrs.LOCATION in resp.headers: + if "&cmd=sendfile" not in resp.headers[hdrs.LOCATION]: + return False + # Directly follow the redirect to not make a second, unnecessary request + next_url = resp.headers[hdrs.LOCATION] + + # Let's try this again and follow redirects + return await fetch_follow_redirects(next_url) + + async def fetch_follow_redirects(file_url: str) -> bool: + async with self.session.get(file_url) as resp: + # We wanted a video but got HTML => Forbidden, auth expired. Logging in won't really + # solve that depending on the setup, but it is better than nothing. if is_video and "html" in resp.content_type: return False From da627ff929abb3a1a3dff58ec46f29025e16c96b Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 9 Apr 2024 14:28:56 +0200 Subject: [PATCH 07/96] Bump version to 3.5.1 --- CHANGELOG.md | 6 +++++- PFERD/version.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df4fcf5..a76508e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,9 +22,13 @@ ambiguous situations. 
## Unreleased +## 3.5.1 - 2024-04-09 + +### Added +- Support for ILIAS 8 + ### Fixed - Video name deduplication -- Compatibility with ILIAS 8 ## 3.5.0 - 2023-09-13 diff --git a/PFERD/version.py b/PFERD/version.py index 5ee464d..3f27494 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.5.0" +VERSION = "3.5.1" From 1cbc2b717a76751725f776483b611bd6b43525cf Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 10 Apr 2024 01:01:59 +0200 Subject: [PATCH 08/96] Fix personal desktop crawling with ILIAS 8 --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a76508e..36768b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Fixed +- Crawling of personal desktop with ILIAS 8 + ## 3.5.1 - 2024-04-09 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 7d6b309..371ffb3 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -228,7 +228,7 @@ instance's greatest bottleneck. 
await self._crawl_url(root_url, expected_id=course_id) async def _crawl_desktop(self) -> None: - appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items" + appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items" appendix = appendix.encode("ASCII").hex() await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix) From 4a5959fd58d9c063ea9a37089d0aaa01c23544bc Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 10 Apr 2024 11:12:48 +0200 Subject: [PATCH 09/96] Fix personal desktop crawling without favorites --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36768b0..5212824 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Crawling of personal desktop with ILIAS 8 +- Crawling of empty personal desktops ## 3.5.1 - 2024-04-09 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 0be6448..aa00a87 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -1074,6 +1074,14 @@ class IliasPage: if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): return True + # Empty personal desktop has zero (0) markers. Match on the text... + if alert := soup.select_one(".alert-info"): + text = alert.getText().lower() + if "you have not yet selected any favourites" in text: + return True + if "sie haben aktuell noch keine favoriten ausgewählt" in text: + return True + # Video listing embeds do not have complete ILIAS html. 
Try to match them by # their video listing table video_table = soup.find( From 3db186a9782e22cf1cd45b8d343b5cfa5124eb25 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 10 Apr 2024 11:12:55 +0200 Subject: [PATCH 10/96] Fix personal desktop crawling HTML warnings --- PFERD/crawl/ilias/kit_ilias_html.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index aa00a87..4cfec9b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -378,6 +378,10 @@ class IliasPage: name = _sanitize_path_name(link.text.strip()) url = self._abs_url_from_link(link) + if "cmd=manage" in url and "cmdClass=ilPDSelectedItemsBlockGUI" in url: + # Configure button/link does not have anything interesting + continue + type = self._find_type_from_link(name, link, url) if not type: _unexpected_html_warning() From eb01aa86cbad96dd3a6dba86b92b73fdefd86eb0 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 14 Apr 2024 12:10:17 +0200 Subject: [PATCH 11/96] Bump version to 3.5.2 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5212824..e404d1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.5.2 - 2024-04-14 + ### Fixed - Crawling of personal desktop with ILIAS 8 - Crawling of empty personal desktops diff --git a/PFERD/version.py b/PFERD/version.py index 3f27494..47da4a6 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.5.1" +VERSION = "3.5.2" From bbcfe9c8dd5383463b4415d78e0a10ca8458b34d Mon Sep 17 00:00:00 2001 From: Florian Raith <37345813+florianraith@users.noreply.github.com> Date: Fri, 19 Apr 2024 16:52:18 +0200 Subject: [PATCH 12/96] Fix typo in CONFIG.md (#89) --- CONFIG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONFIG.md b/CONFIG.md index 5f62749..25496e0 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -146,7 +146,7 @@ crawler simulate a slower, network-based crawler. This crawler crawls a KIT-IPD page by url. The root page can be crawled from outside the KIT network so you will be informed about any new/deleted files, -but downloading files requires you to be within. Adding a show delay between +but downloading files requires you to be within. Adding a short delay between requests is likely a good idea. - `target`: URL to a KIT-IPD page From 3e831c7e23e9214e2cbbaf04709c153ee1fcb893 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 24 Apr 2024 22:32:26 +0200 Subject: [PATCH 13/96] Fix normalization of meeting names in cards --- CHANGELOG.md | 3 + PFERD/crawl/ilias/kit_ilias_html.py | 119 ++++++++++++++++------------ 2 files changed, 70 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e404d1d..f244a9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Fixed +- Normalization of meeting names in cards + ## 3.5.2 - 2024-04-14 ### Fixed diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 4cfec9b..866f7c0 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -61,6 +61,47 @@ class IliasPageElement: log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.") return self.url + @staticmethod + def create_new( + typ: IliasElementType, + url: str, + name: str, + mtime: Optional[datetime] = None, + description: Optional[str] = None + ) -> 'IliasPageElement': + if typ == IliasElementType.MEETING: + normalized = _sanitize_path_name(IliasPageElement._normalize_meeting_name(name)) + log.explain(f"Normalized meeting name from {name!r} to {normalized!r}") + name = normalized + return IliasPageElement(typ, url, name, mtime, description) + + @staticmethod + def _normalize_meeting_name(meeting_name: str) -> str: + """ + Normalizes meeting names, which have a relative time as their first part, + to their date in ISO format. + """ + + # This checks whether we can reach a `:` without passing a `-` + if re.search(r"^[^-]+: ", meeting_name): + # Meeting name only contains date: "05. Jan 2000:" + split_delimiter = ":" + else: + # Meeting name contains date and start/end times: "05. 
Jan 2000, 16:00 - 17:30:" + split_delimiter = ", " + + # We have a meeting day without time + date_portion_str = meeting_name.split(split_delimiter)[0] + date_portion = demangle_date(date_portion_str) + + # We failed to parse the date, bail out + if not date_portion: + return meeting_name + + # Replace the first section with the absolute date + rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) + return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name + @dataclass class IliasDownloadForumData: @@ -130,7 +171,7 @@ class IliasPage: attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x} ) if tab is not None: - return IliasPageElement( + return IliasPageElement.create_new( IliasElementType.INFO_TAB, self._abs_url_from_link(tab), "infos" @@ -295,7 +336,7 @@ class IliasPage: if not element: return None link = self._abs_url_from_link(element) - return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + return IliasPageElement.create_new(IliasElementType.FOLDER, link, "show all meetings") def _is_content_tab_selected(self) -> bool: return self._select_content_page_url() is None @@ -315,7 +356,7 @@ class IliasPage: link = tab.find("a") if link: link = self._abs_url_from_link(link) - return IliasPageElement(IliasElementType.FOLDER, link, "select content page") + return IliasPageElement.create_new(IliasElementType.FOLDER, link, "select content page") _unexpected_html_warning() log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.") @@ -345,14 +386,16 @@ class IliasPage: # and just fetch the lone video url! 
if len(streams) == 1: video_url = streams[0]["sources"]["mp4"][0]["src"] - return [IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name)] + return [ + IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name) + ] log.explain(f"Found multiple videos for stream at {self._source_name}") items = [] for stream in sorted(streams, key=lambda stream: stream["content"]): full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" video_url = stream["sources"]["mp4"][0]["src"] - items.append(IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, full_name)) + items.append(IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO, video_url, full_name)) return items @@ -367,7 +410,7 @@ class IliasPage: link = self._abs_url_from_link(correct_link) - return IliasPageElement(IliasElementType.FORUM, link, "show all forum threads") + return IliasPageElement.create_new(IliasElementType.FORUM, link, "show all forum threads") def _find_personal_desktop_entries(self) -> List[IliasPageElement]: items: List[IliasPageElement] = [] @@ -394,7 +437,7 @@ class IliasPage: url = re.sub(r"(target=file_\d+)", r"\1_download", url) log.explain("Rewired file URL to include download part") - items.append(IliasPageElement(type, url, name)) + items.append(IliasPageElement.create_new(type, url, name)) return items @@ -412,7 +455,7 @@ class IliasPage: log.warn_contd(f"Found unknown content page item {name!r} with url {url!r}") continue - items.append(IliasPageElement(IliasElementType.FILE, url, name)) + items.append(IliasPageElement.create_new(IliasElementType.FILE, url, name)) return items @@ -425,7 +468,7 @@ class IliasPage: continue if "cmd=sendfile" not in link["href"]: continue - items.append(IliasPageElement( + items.append(IliasPageElement.create_new( IliasElementType.FILE, self._abs_url_from_link(link), _sanitize_path_name(link.getText()) @@ -453,7 +496,9 @@ class IliasPage: query_params = {"limit": "800", 
"cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) log.explain("Found ILIAS video frame page, fetching actual content next") - return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] + return [ + IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "") + ] is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None @@ -482,7 +527,7 @@ class IliasPage: url = url_set_query_params(self._page_url, query_params) log.explain("Disabled pagination, retrying folder as a new entry") - return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] + return [IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]: """ @@ -527,7 +572,7 @@ class IliasPage: video_url = self._abs_url_from_link(link) log.explain(f"Found video {video_name!r} at {video_url}") - return IliasPageElement( + return IliasPageElement.create_new( IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time ) @@ -563,7 +608,7 @@ class IliasPage: if date is None: log.warn(f"Date parsing failed for exercise entry {name!r}") - results.append(IliasPageElement( + results.append(IliasPageElement.create_new( IliasElementType.FILE, self._abs_url_from_link(link), name, @@ -600,7 +645,7 @@ class IliasPage: url = self._abs_url_from_link(file_link) log.explain(f"Found exercise entry {file_name!r}") - results.append(IliasPageElement( + results.append(IliasPageElement.create_new( IliasElementType.FILE, url, container_name + "/" + file_name, @@ -625,7 +670,7 @@ class IliasPage: file_name = _sanitize_path_name(label_container.getText().strip()) url = self._abs_url_from_link(listing) log.explain(f"Found exercise detail {file_name!r} at {url}") - results.append(IliasPageElement( + results.append(IliasPageElement.create_new( IliasElementType.EXERCISE_FILES, url, 
container_name + "/" + file_name, @@ -660,16 +705,13 @@ class IliasPage: if not element_type: continue - if element_type == IliasElementType.MEETING: - normalized = _sanitize_path_name(self._normalize_meeting_name(element_name)) - log.explain(f"Normalized meeting name from {element_name!r} to {normalized!r}") - element_name = normalized elif element_type == IliasElementType.FILE: result.append(self._file_to_element(element_name, abs_url, link)) continue log.explain(f"Found {element_name!r}") - result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) + result.append(IliasPageElement.create_new( + element_type, abs_url, element_name, description=description)) result += self._find_cards() result += self._find_mediacast_videos() @@ -692,8 +734,8 @@ class IliasPage: log.warn_contd(f"No

heading + drop_h1: bool = len(page.find_all(name="h1")) <= 1 + + folder_tree: KitIpdFolder = KitIpdFolder(".", []) for element in elements: - folder_label = self._find_folder_label(element) - if folder_label: - folder = self._extract_folder(folder_label, url) - if folder not in items: - items.add(folder) - folder.explain() - else: - file = self._extract_file(element, url) - items.add(file) - log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") - log.explain("Attributing it to root folder") + parent = HttpCrawler.get_folder_structure_from_heading_hierarchy(element, drop_h1) + file = self._extract_file(element, url) - return items + current_folder: KitIpdFolder = folder_tree + for folder_name in parent.parts: + # helps the type checker to verify that current_folder is indeed a folder + def subfolders() -> Generator[KitIpdFolder, Any, None]: + return (entry for entry in current_folder.entries if isinstance(entry, KitIpdFolder)) - def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder: - files: List[KitIpdFile] = [] - name = folder_tag.getText().strip() + if not any(entry.name == folder_name for entry in subfolders()): + current_folder.entries.append(KitIpdFolder(folder_name, [])) + current_folder = next(entry for entry in subfolders() if entry.name == folder_name) - container: Tag = folder_tag.findNextSibling(name="table") - for link in self._find_file_links(container): - files.append(self._extract_file(link, url)) + current_folder.entries.append(file) - return KitIpdFolder(name, files) - - @staticmethod - def _find_folder_label(file_link: Tag) -> Optional[Tag]: - enclosing_table: Tag = file_link.findParent(name="table") - if enclosing_table is None: - return None - return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) + return folder_tree.entries def _extract_file(self, link: Tag, url: str) -> KitIpdFile: url = self._abs_url_from_link(url, link) From 596b6a7688a5101ec6e44a13f602c4673eb5e8e0 Mon Sep 17 00:00:00 2001 From: 
PinieP <59698589+PinieP@users.noreply.github.com> Date: Tue, 5 Nov 2024 18:30:34 +0100 Subject: [PATCH 50/96] Add support for non-KIT shibboleth login (#98) Co-authored-by: Mr-Pine Co-authored-by: I-Al-Istannen --- CHANGELOG.md | 1 + CONFIG.md | 21 ++- LICENSE | 2 +- PFERD/crawl/http_crawler.py | 7 +- PFERD/crawl/ilias/async_helper.py | 3 +- PFERD/crawl/ilias/ilias_web_crawler.py | 98 ++++++---- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 210 +-------------------- PFERD/crawl/ilias/shibboleth_login.py | 128 +++++++++++++ 8 files changed, 226 insertions(+), 244 deletions(-) create mode 100644 PFERD/crawl/ilias/shibboleth_login.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 12cda26..8024bba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Added - Support for MOB videos in page descriptions - Clickable links in the report to directly open new/modified/not-deleted files +- Support for non KIT shibboleth login ### Changed - Remove videos from description pages diff --git a/CONFIG.md b/CONFIG.md index a52506d..9b79be8 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -163,12 +163,13 @@ out of the box for the corresponding universities: [ilias-dl]: https://github.com/V3lop5/ilias-downloader/blob/main/configs "ilias-downloader configs" -| University | `base_url` | `client_id` | -|---------------|--------------------------------------|---------------| -| FH Aachen | https://www.ili.fh-aachen.de | elearning | -| Uni Köln | https://www.ilias.uni-koeln.de/ilias | uk | -| Uni Konstanz | https://ilias.uni-konstanz.de | ILIASKONSTANZ | -| Uni Stuttgart | https://ilias3.uni-stuttgart.de | Uni_Stuttgart | +| University | `base_url` | `login_type` | `client_id` | +|---------------|-----------------------------------------|--------------|---------------| +| FH Aachen | https://www.ili.fh-aachen.de | local | elearning | +| Uni Köln | https://www.ilias.uni-koeln.de/ilias | local | uk | +| Uni Konstanz | https://ilias.uni-konstanz.de | local | 
ILIASKONSTANZ | +| Uni Stuttgart | https://ilias3.uni-stuttgart.de | local | Uni_Stuttgart | +| Uni Tübingen | https://ovidius.uni-tuebingen.de/ilias3 | shibboleth | | If your university isn't listed, try navigating to your instance's login page. Assuming no custom login service is used, the URL will look something like this: @@ -180,7 +181,11 @@ Assuming no custom login service is used, the URL will look something like this: If the values work, feel free to submit a PR and add them to the table above. - `base_url`: The URL where the ILIAS instance is located. (Required) -- `client_id`: An ID used for authentication. (Required) +- `login_type`: How you authenticate. (Required) + - `local`: Use `client_id` for authentication. + - `shibboleth`: Use shibboleth for authentication. +- `client_id`: An ID used for authentication if `login_type` is `local`. Is + ignored if `login_type` is `shibboleth`. - `target`: The ILIAS element to crawl. (Required) - `desktop`: Crawl your personal desktop / dashboard - ``: Crawl the course with the given id @@ -191,6 +196,8 @@ If the values work, feel free to submit a PR and add them to the table above. and duplication warnings if you are a member of an ILIAS group. The `desktop` target is generally preferable. - `auth`: Name of auth section to use for login. (Required) +- `tfa_auth`: Name of auth section to use for two-factor authentication. Only + uses the auth section's password. (Default: Anonymous `tfa` authenticator) - `links`: How to represent external links. (Default: `fancy`) - `ignore`: Don't download links. - `plaintext`: A text file containing only the URL. 
diff --git a/LICENSE b/LICENSE index 13fa307..ccccbe3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ Copyright 2019-2024 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, TheChristophe, Scriptim, thelukasprobst, Toorero, - Mr-Pine, p-fruck + Mr-Pine, p-fruck, PinieP Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index fe8a360..2cc97e1 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -262,7 +262,12 @@ class HttpCrawler(Crawler): connect=self._http_timeout, sock_connect=self._http_timeout, sock_read=self._http_timeout, - ) + ), + # See https://github.com/aio-libs/aiohttp/issues/6626 + # Without this aiohttp will mangle the redirect header from Shibboleth, invalidating the + # passed signature. Shibboleth will not accept the broken signature and authentication will + # fail. + requote_redirect_url=False ) as session: self.session = session try: diff --git a/PFERD/crawl/ilias/async_helper.py b/PFERD/crawl/ilias/async_helper.py index 527a819..5e586b1 100644 --- a/PFERD/crawl/ilias/async_helper.py +++ b/PFERD/crawl/ilias/async_helper.py @@ -25,9 +25,10 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla except asyncio.exceptions.TimeoutError as e: # explicit http timeouts in HttpCrawler last_exception = e log.explain_topic(f"Retrying operation {name}. 
Retries left: {attempts - 1 - round}") + log.explain(f"Last exception: {last_exception!r}") if last_exception: - message = f"Error in I/O Operation: {last_exception}" + message = f"Error in I/O Operation: {last_exception!r}" if failure_is_error: raise CrawlError(message) from last_exception else: diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 941b265..a6c68f1 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -23,10 +23,16 @@ from .file_templates import Links, learning_module_template from .ilias_html_cleaner import clean, insert_base_markup from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) +from .shibboleth_login import ShibbolethLogin TargetType = Union[str, int] +class LoginTypeLocal: + def __init__(self, client_id: str): + self.client_id = client_id + + class IliasWebCrawlerSection(HttpCrawlerSection): def base_url(self) -> str: base_url = self.s.get("base_url") @@ -35,12 +41,30 @@ class IliasWebCrawlerSection(HttpCrawlerSection): return base_url - def client_id(self) -> str: - client_id = self.s.get("client_id") - if not client_id: - self.missing_value("client_id") + def login(self) -> Union[Literal["shibboleth"], LoginTypeLocal]: + login_type = self.s.get("login_type") + if not login_type: + self.missing_value("login_type") + if login_type == "shibboleth": + return "shibboleth" + if login_type == "local": + client_id = self.s.get("client_id") + if not client_id: + self.missing_value("client_id") + return LoginTypeLocal(client_id) - return client_id + self.invalid_value("login_type", login_type, "Should be ") + + def tfa_auth( + self, authenticators: Dict[str, Authenticator] + ) -> Optional[Authenticator]: + value: Optional[str] = self.s.get("tfa_auth") + if value is None: + return None + auth = authenticators.get(value) + if auth is None: + 
self.invalid_value("tfa_auth", value, "No such auth section exists") + return auth def target(self) -> TargetType: target = self.s.get("target") @@ -156,7 +180,13 @@ instance's greatest bottleneck. self._auth = auth self._base_url = section.base_url() - self._client_id = section.client_id() + self._tfa_auth = section.tfa_auth(authenticators) + + self._login_type = section.login() + if isinstance(self._login_type, LoginTypeLocal): + self._client_id = self._login_type.client_id + else: + self._shibboleth_login = ShibbolethLogin(self._base_url, self._auth, self._tfa_auth) self._target = section.target() self._link_file_redirect_delay = section.link_redirect_delay() @@ -179,7 +209,7 @@ instance's greatest bottleneck. async def _crawl_course(self, course_id: int) -> None: # Start crawling at the given course root_url = url_set_query_param( - urljoin(self._base_url, "/goto.php"), + urljoin(self._base_url + "/", "goto.php"), "target", f"crs_{course_id}", ) @@ -460,11 +490,12 @@ instance's greatest bottleneck. return "" return None + auth_id = await self._current_auth_id() target = await impl() if target is not None: return target - await self._authenticate() + await self.authenticate(auth_id) target = await impl() if target is not None: @@ -935,38 +966,39 @@ instance's greatest bottleneck. return await request.read() raise CrawlError("get_authenticated failed even after authenticating") - # ToDo: Is iorepeat still required? 
- @_iorepeat(3, "Login", failure_is_error=True) async def _authenticate(self) -> None: # fill the session with the correct cookies - params = { - "client_id": self._client_id, - "cmd": "force_login", - } - async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request: - login_page = soupify(await request.read()) + if self._login_type == "shibboleth": + await self._shibboleth_login.login(self.session) + else: + params = { + "client_id": self._client_id, + "cmd": "force_login", + } + async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request: + login_page = soupify(await request.read()) - login_form = login_page.find("form", attrs={"name": "formlogin"}) - if login_form is None: - raise CrawlError("Could not find the login form! Specified client id might be invalid.") + login_form = login_page.find("form", attrs={"name": "formlogin"}) + if login_form is None: + raise CrawlError("Could not find the login form! Specified client id might be invalid.") - login_url = login_form.attrs.get("action") - if login_url is None: - raise CrawlError("Could not find the action URL in the login form!") + login_url = login_form.attrs.get("action") + if login_url is None: + raise CrawlError("Could not find the action URL in the login form!") - username, password = await self._auth.credentials() + username, password = await self._auth.credentials() - login_data = { - "username": username, - "password": password, - "cmd[doStandardAuthentication]": "Login", - } + login_data = { + "username": username, + "password": password, + "cmd[doStandardAuthentication]": "Login", + } - # do the actual login - async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request: - soup = soupify(await request.read()) - if not self._is_logged_in(soup): - self._auth.invalidate_credentials() + # do the actual login + async with self.session.post(urljoin(self._base_url, login_url), data=login_data) as request: + soup = 
soupify(await request.read()) + if not self._is_logged_in(soup): + self._auth.invalidate_credentials() @staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 558221d..fc1d58f 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,23 +1,14 @@ -from typing import Any, Dict, Optional, Union +from typing import Dict, Literal -import aiohttp -import yarl -from bs4 import BeautifulSoup - -from ...auth import Authenticator, TfaAuthenticator +from ...auth import Authenticator from ...config import Config -from ...logging import log -from ...utils import soupify -from ..crawler import CrawlError, CrawlWarning -from .async_helper import _iorepeat from .ilias_web_crawler import IliasWebCrawler, IliasWebCrawlerSection - -TargetType = Union[str, int] +from .shibboleth_login import ShibbolethLogin _ILIAS_URL = "https://ilias.studium.kit.edu" -class KitShibbolethBackgroundLoginSuccessful(): +class KitShibbolethBackgroundLoginSuccessful: pass @@ -25,19 +16,8 @@ class KitIliasWebCrawlerSection(IliasWebCrawlerSection): def base_url(self) -> str: return _ILIAS_URL - def client_id(self) -> str: - # KIT ILIAS uses the Shibboleth service for authentication. There's no - # use for a client id. 
- return "unused" - - def tfa_auth(self, authenticators: Dict[str, Authenticator]) -> Optional[Authenticator]: - value: Optional[str] = self.s.get("tfa_auth") - if value is None: - return None - auth = authenticators.get(value) - if auth is None: - self.invalid_value("tfa_auth", value, "No such auth section exists") - return auth + def login(self) -> Literal["shibboleth"]: + return "shibboleth" class KitIliasWebCrawler(IliasWebCrawler): @@ -46,184 +26,12 @@ class KitIliasWebCrawler(IliasWebCrawler): name: str, section: KitIliasWebCrawlerSection, config: Config, - authenticators: Dict[str, Authenticator] + authenticators: Dict[str, Authenticator], ): super().__init__(name, section, config, authenticators) - self._shibboleth_login = KitShibbolethLogin( + self._shibboleth_login = ShibbolethLogin( + _ILIAS_URL, self._auth, section.tfa_auth(authenticators), ) - - # We repeat this as the login method in shibboleth doesn't handle I/O errors. - # Shibboleth is quite reliable as well, the repeat is likely not critical here. - @_iorepeat(3, "Login", failure_is_error=True) - async def _authenticate(self) -> None: - await self._shibboleth_login.login(self.session) - - -class KitShibbolethLogin: - """ - Login via KIT's shibboleth system. - """ - - def __init__(self, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator]) -> None: - self._auth = authenticator - self._tfa_auth = tfa_authenticator - - async def login(self, sess: aiohttp.ClientSession) -> None: - """ - Performs the ILIAS Shibboleth authentication dance and saves the login - cookies it receieves. - - This function should only be called whenever it is detected that you're - not logged in. The cookies obtained should be good for a few minutes, - maybe even an hour or two. 
- """ - - # Equivalent: Click on "Mit KIT-Account anmelden" button in - # https://ilias.studium.kit.edu/login.php - url = f"{_ILIAS_URL}/shib_login.php" - data = { - "sendLogin": "1", - "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", - "il_target": "", - "home_organization_selection": "Weiter", - } - soup: Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful] = await _shib_post(sess, url, data) - - if isinstance(soup, KitShibbolethBackgroundLoginSuccessful): - return - - # Attempt to login using credentials, if necessary - while not self._login_successful(soup): - # Searching the form here so that this fails before asking for - # credentials rather than after asking. - form = soup.find("form", {"class": "full content", "method": "post"}) - action = form["action"] - - csrf_token = form.find("input", {"name": "csrf_token"})["value"] - - # Equivalent: Enter credentials in - # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO - url = "https://idp.scc.kit.edu" + action - username, password = await self._auth.credentials() - data = { - "_eventId_proceed": "", - "j_username": username, - "j_password": password, - "csrf_token": csrf_token - } - soup = await _post(sess, url, data) - - if soup.find(id="attributeRelease"): - raise CrawlError( - "ILIAS Shibboleth entitlements changed! 
" - "Please log in once in your browser and review them" - ) - - if self._tfa_required(soup): - soup = await self._authenticate_tfa(sess, soup) - - if not self._login_successful(soup): - self._auth.invalidate_credentials() - - # Equivalent: Being redirected via JS automatically - # (or clicking "Continue" if you have JS disabled) - relay_state = soup.find("input", {"name": "RelayState"}) - saml_response = soup.find("input", {"name": "SAMLResponse"}) - url = f"{_ILIAS_URL}/Shibboleth.sso/SAML2/POST" - data = { # using the info obtained in the while loop above - "RelayState": relay_state["value"], - "SAMLResponse": saml_response["value"], - } - await sess.post(url, data=data) - - async def _authenticate_tfa( - self, - session: aiohttp.ClientSession, - soup: BeautifulSoup - ) -> BeautifulSoup: - if not self._tfa_auth: - self._tfa_auth = TfaAuthenticator("ilias-anon-tfa") - - tfa_token = await self._tfa_auth.password() - - # Searching the form here so that this fails before asking for - # credentials rather than after asking. 
- form = soup.find("form", {"method": "post"}) - action = form["action"] - csrf_token = form.find("input", {"name": "csrf_token"})["value"] - - # Equivalent: Enter token in - # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO - url = "https://idp.scc.kit.edu" + action - data = { - "_eventId_proceed": "", - "j_tokenNumber": tfa_token, - "csrf_token": csrf_token - } - return await _post(session, url, data) - - @staticmethod - def _login_successful(soup: BeautifulSoup) -> bool: - relay_state = soup.find("input", {"name": "RelayState"}) - saml_response = soup.find("input", {"name": "SAMLResponse"}) - return relay_state is not None and saml_response is not None - - @staticmethod - def _tfa_required(soup: BeautifulSoup) -> bool: - return soup.find(id="j_tokenNumber") is not None - - -async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: - async with session.post(url, data=data) as response: - return soupify(await response.read()) - - -async def _shib_post( - session: aiohttp.ClientSession, - url: str, - data: Any -) -> Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful]: - """ - aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected - by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and - build encoded URL objects ourselves... Who thought mangling location header was a good idea?? 
- """ - log.explain_topic("Shib login POST") - async with session.post(url, data=data, allow_redirects=False) as response: - location = response.headers.get("location") - log.explain(f"Got location {location!r}") - if not location: - raise CrawlWarning(f"Login failed (1), no location header present at {url}") - correct_url = yarl.URL(location, encoded=True) - log.explain(f"Corrected location to {correct_url!r}") - - if str(correct_url).startswith(_ILIAS_URL): - log.explain("ILIAS recognized our shib token and logged us in in the background, returning") - return KitShibbolethBackgroundLoginSuccessful() - - async with session.get(correct_url, allow_redirects=False) as response: - location = response.headers.get("location") - log.explain(f"Redirected to {location!r} with status {response.status}") - # If shib still has a valid session, it will directly respond to the request - if location is None: - log.explain("Shib recognized us, returning its response directly") - return soupify(await response.read()) - - as_yarl = yarl.URL(response.url) - # Probably not needed anymore, but might catch a few weird situations with a nicer message - if not location or not as_yarl.host: - raise CrawlWarning(f"Login failed (2), no location header present at {correct_url}") - - correct_url = yarl.URL.build( - scheme=as_yarl.scheme, - host=as_yarl.host, - path=location, - encoded=True - ) - log.explain(f"Corrected location to {correct_url!r}") - - async with session.get(correct_url, allow_redirects=False) as response: - return soupify(await response.read()) diff --git a/PFERD/crawl/ilias/shibboleth_login.py b/PFERD/crawl/ilias/shibboleth_login.py new file mode 100644 index 0000000..d57820e --- /dev/null +++ b/PFERD/crawl/ilias/shibboleth_login.py @@ -0,0 +1,128 @@ +from typing import Any, Optional + +import aiohttp +import yarl +from bs4 import BeautifulSoup + +from ...auth import Authenticator, TfaAuthenticator +from ...logging import log +from ...utils import soupify +from ..crawler 
import CrawlError + + +class ShibbolethLogin: + """ + Login via shibboleth system. + """ + + def __init__( + self, ilias_url: str, authenticator: Authenticator, tfa_authenticator: Optional[Authenticator] + ) -> None: + self._ilias_url = ilias_url + self._auth = authenticator + self._tfa_auth = tfa_authenticator + + async def login(self, sess: aiohttp.ClientSession) -> None: + """ + Performs the ILIAS Shibboleth authentication dance and saves the login + cookies it receieves. + + This function should only be called whenever it is detected that you're + not logged in. The cookies obtained should be good for a few minutes, + maybe even an hour or two. + """ + + # Equivalent: Click on "Mit KIT-Account anmelden" button in + # https://ilias.studium.kit.edu/login.php + url = f"{self._ilias_url}/shib_login.php" + async with sess.get(url) as response: + shib_url = response.url + if str(shib_url).startswith(self._ilias_url): + log.explain( + "ILIAS recognized our shib token and logged us in in the background, returning" + ) + return + soup: BeautifulSoup = soupify(await response.read()) + + # Attempt to login using credentials, if necessary + while not self._login_successful(soup): + # Searching the form here so that this fails before asking for + # credentials rather than after asking. + form = soup.find("form", {"method": "post"}) + action = form["action"] + + # Equivalent: Enter credentials in + # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO + url = str(shib_url.origin()) + action + username, password = await self._auth.credentials() + data = { + "_eventId_proceed": "", + "j_username": username, + "j_password": password, + } + if csrf_token_input := form.find("input", {"name": "csrf_token"}): + data["csrf_token"] = csrf_token_input["value"] + soup = await _post(sess, url, data) + + if soup.find(id="attributeRelease"): + raise CrawlError( + "ILIAS Shibboleth entitlements changed! 
" + "Please log in once in your browser and review them" + ) + + if self._tfa_required(soup): + soup = await self._authenticate_tfa(sess, soup, shib_url) + + if not self._login_successful(soup): + self._auth.invalidate_credentials() + + # Equivalent: Being redirected via JS automatically + # (or clicking "Continue" if you have JS disabled) + relay_state = soup.find("input", {"name": "RelayState"}) + saml_response = soup.find("input", {"name": "SAMLResponse"}) + url = form = soup.find("form", {"method": "post"})["action"] + data = { # using the info obtained in the while loop above + "RelayState": relay_state["value"], + "SAMLResponse": saml_response["value"], + } + await sess.post(url, data=data) + + async def _authenticate_tfa( + self, session: aiohttp.ClientSession, soup: BeautifulSoup, shib_url: yarl.URL + ) -> BeautifulSoup: + if not self._tfa_auth: + self._tfa_auth = TfaAuthenticator("ilias-anon-tfa") + + tfa_token = await self._tfa_auth.password() + + # Searching the form here so that this fails before asking for + # credentials rather than after asking. 
+ form = soup.find("form", {"method": "post"}) + action = form["action"] + + # Equivalent: Enter token in + # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO + url = str(shib_url.origin()) + action + username, password = await self._auth.credentials() + data = { + "_eventId_proceed": "", + "j_tokenNumber": tfa_token, + } + if csrf_token_input := form.find("input", {"name": "csrf_token"}): + data["csrf_token"] = csrf_token_input["value"] + return await _post(session, url, data) + + @staticmethod + def _login_successful(soup: BeautifulSoup) -> bool: + relay_state = soup.find("input", {"name": "RelayState"}) + saml_response = soup.find("input", {"name": "SAMLResponse"}) + return relay_state is not None and saml_response is not None + + @staticmethod + def _tfa_required(soup: BeautifulSoup) -> bool: + return soup.find(id="j_tokenNumber") is not None + + +async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: + async with session.post(url, data=data) as response: + return soupify(await response.read()) From 6dda4c55a8bdd0afba9126f39e7402df7dc59479 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 5 Nov 2024 18:36:21 +0100 Subject: [PATCH 51/96] Add doctype header to forum threads This should fix mimetype detection on most systems and is more relevant now that the report is clickable --- CHANGELOG.md | 1 + PFERD/crawl/ilias/ilias_web_crawler.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8024bba..5206b20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ ambiguous situations. 
- Crawling of nested courses - Downloading of links with no target URL - Handle row flex on description pages +- Add `<!DOCTYPE html>` heading to forum threads to fix mime type detection ## 3.6.0 - 2024-10-23 diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index a6c68f1..2fc399d 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -750,7 +750,8 @@ instance's greatest bottleneck. return async with maybe_dl as (bar, sink): - content = element.title_tag.prettify() + content = "<!DOCTYPE html>\n" + content += element.title_tag.prettify() content += element.content_tag.prettify() sink.file.write(content.encode("utf-8")) sink.done() From 712217e95962a383ee95c58fd85c61980ef1fc14 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 11 Nov 2024 12:52:55 +0100 Subject: [PATCH 52/96] Handle groups in cards --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5206b20..095442d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ ambiguous situations. 
- Downloading of links with no target URL - Handle row flex on description pages - Add `<!DOCTYPE html>` heading to forum threads to fix mime type detection +- Handle groups in cards ## 3.6.0 - 2024-10-23 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index e0c87ad..57c81e5 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -998,6 +998,8 @@ class IliasPage: return IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED if "exc" in icon["class"]: return IliasElementType.EXERCISE + if "grp" in icon["class"]: + return IliasElementType.FOLDER if "webr" in icon["class"]: return IliasElementType.LINK if "book" in icon["class"]: From 287173b0b114f708cb34db4f3fef247962fccc3d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 13 Nov 2024 20:38:27 +0100 Subject: [PATCH 53/96] Bump version to 3.7.0 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 095442d..e18f88a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## Fixed +- File links in report on Windows + ## 3.7.0 - 2024-11-13 ### Added diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 850e68e..ca2e5b7 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -1,6 +1,5 @@ from pathlib import Path, PurePath from typing import Dict, List, Optional -from urllib.parse import quote from rich.markup import escape @@ -171,7 +170,7 @@ class Pferd: def fmt_path_link(relative_path: PurePath) -> str: # We need to URL-encode the path because it might contain spaces or special characters - link = f"file://{quote(str(crawler.output_dir.resolve(relative_path).absolute()))}" + link = crawler.output_dir.resolve(relative_path).absolute().as_uri() return f"[link={link}]{fmt_path(relative_path)}[/link]" something_changed = False From 16a2dd5b15561f91134bc2a3b31a92483921e021 Mon Sep 17 00:00:00 2001 From: Aurelia Date: Wed, 12 Feb 2025 21:48:05 +0100 Subject: [PATCH 55/96] fix: totp --- CHANGELOG.md | 1 + PFERD/crawl/ilias/shibboleth_login.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbd2dd6..2ff98bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. 
## Fixed - File links in report on Windows +- TOTP authentication in KIT Shibboleth ## 3.7.0 - 2024-11-13 diff --git a/PFERD/crawl/ilias/shibboleth_login.py b/PFERD/crawl/ilias/shibboleth_login.py index d57820e..ab59f25 100644 --- a/PFERD/crawl/ilias/shibboleth_login.py +++ b/PFERD/crawl/ilias/shibboleth_login.py @@ -59,6 +59,7 @@ class ShibbolethLogin: "_eventId_proceed": "", "j_username": username, "j_password": password, + "fudis_web_authn_assertion_input": "", } if csrf_token_input := form.find("input", {"name": "csrf_token"}): data["csrf_token"] = csrf_token_input["value"] @@ -106,7 +107,7 @@ class ShibbolethLogin: username, password = await self._auth.credentials() data = { "_eventId_proceed": "", - "j_tokenNumber": tfa_token, + "fudis_otp_input": tfa_token, } if csrf_token_input := form.find("input", {"name": "csrf_token"}): data["csrf_token"] = csrf_token_input["value"] @@ -120,7 +121,7 @@ class ShibbolethLogin: @staticmethod def _tfa_required(soup: BeautifulSoup) -> bool: - return soup.find(id="j_tokenNumber") is not None + return soup.find(id="fudiscr-form") is not None async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: From bd9d7efe646b63f607dc1c2b5c23c6e9b5bd0466 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 12 Feb 2025 22:41:43 +0100 Subject: [PATCH 56/96] "Fix" mypy errors Thank you mypy, very cool. These types make things *so much better*. They don't just complicate everything and don't really help because they can not detect that an element queried by a tag is no navigable string... 
--- PFERD/auth/keyring.py | 4 +- PFERD/crawl/http_crawler.py | 7 +- PFERD/crawl/ilias/file_templates.py | 10 +- PFERD/crawl/ilias/ilias_html_cleaner.py | 14 +- PFERD/crawl/ilias/ilias_web_crawler.py | 34 +-- PFERD/crawl/ilias/kit_ilias_html.py | 323 ++++++++++++------------ PFERD/crawl/ilias/shibboleth_login.py | 28 +- PFERD/crawl/kit_ipd_crawler.py | 8 +- 8 files changed, 224 insertions(+), 204 deletions(-) diff --git a/PFERD/auth/keyring.py b/PFERD/auth/keyring.py index c14f6fb..02a9269 100644 --- a/PFERD/auth/keyring.py +++ b/PFERD/auth/keyring.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import Optional, Tuple, cast import keyring @@ -13,7 +13,7 @@ class KeyringAuthSection(AuthSection): return self.s.get("username") def keyring_name(self) -> str: - return self.s.get("keyring_name", fallback=NAME) + return cast(str, self.s.get("keyring_name", fallback=NAME)) class KeyringAuthenticator(Authenticator): diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 2cc97e1..1c4631c 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -3,7 +3,7 @@ import http.cookies import ssl from datetime import datetime from pathlib import Path, PurePath -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import aiohttp import certifi @@ -187,12 +187,12 @@ class HttpCrawler(Crawler): if level == 0 or (level == 1 and drop_h1): return PurePath() - level_heading = tag.find_previous(name=f"h{level}") + level_heading = cast(Optional[Tag], tag.find_previous(name=f"h{level}")) if level_heading is None: return find_associated_headings(tag, level - 1) - folder_name = level_heading.getText().strip() + folder_name = level_heading.get_text().strip() return find_associated_headings(level_heading, level - 1) / folder_name # start at level

because paragraph-level headings are usually too granular for folder names @@ -231,6 +231,7 @@ class HttpCrawler(Crawler): etag_header = resp.headers.get("ETag") last_modified_header = resp.headers.get("Last-Modified") + last_modified = None if last_modified_header: try: diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index b206461..0a72199 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Optional +from typing import Optional, cast import bs4 @@ -139,13 +139,13 @@ def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next """ if prev and body.select_one(".ilc_page_lnav_LeftNavigation"): - text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip() + text = cast(bs4.Tag, body.select_one(".ilc_page_lnav_LeftNavigation")).get_text().strip() left = f'{text}' else: left = "" if next and body.select_one(".ilc_page_rnav_RightNavigation"): - text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip() + text = cast(bs4.Tag, body.select_one(".ilc_page_rnav_RightNavigation")).get_text().strip() right = f'{text}' else: right = "" @@ -160,8 +160,8 @@ def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next "{{left}}", left).replace("{{right}}", right).encode()) ) - body = body.prettify() - return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name) + body_str = cast(str, body.prettify()) + return _learning_module_template.replace("{{body}}", body_str).replace("{{name}}", name) class Links(Enum): diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index e82906f..fb35bc0 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -1,3 +1,5 @@ +from typing import cast + from bs4 import BeautifulSoup, Comment, Tag _STYLE_TAG_CONTENT = """ @@ -70,18 +72,18 @@ def 
insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: def clean(soup: BeautifulSoup) -> BeautifulSoup: - for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES): + for block in cast(list[Tag], soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES)): block.name = "article" - for block in soup.find_all("h3"): + for block in cast(list[Tag], soup.find_all("h3")): block.name = "div" - for block in soup.find_all("h1"): + for block in cast(list[Tag], soup.find_all("h1")): block.name = "h3" - for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"): + for block in cast(list[Tag], soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap")): block.name = "h3" - block["class"] += ["accordion-head"] + block["class"] += ["accordion-head"] # type: ignore for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"): children = list(dummy.children) @@ -97,7 +99,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: if figure := video.find_parent("figure"): figure.decompose() - for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): + for hrule_imposter in cast(list[Tag], soup.find_all(class_="ilc_section_Separator")): hrule_imposter.insert(0, soup.new_tag("hr")) return soup diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 2fc399d..557150c 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -257,6 +257,7 @@ instance's greatest bottleneck. async with cl: next_stage_url: Optional[str] = url current_parent = current_element + page = None while next_stage_url: soup = await self._get_page(next_stage_url) @@ -278,6 +279,7 @@ instance's greatest bottleneck. else: next_stage_url = None + page = cast(IliasPage, page) elements.extend(page.get_child_elements()) if description_string := page.get_description(): description.append(description_string) @@ -461,10 +463,10 @@ instance's greatest bottleneck. 
if not dl: return - async with dl as (bar, sink): + async with dl as (_bar, sink): description = clean(insert_base_markup(description)) - description = await self.internalize_images(description) - sink.file.write(description.prettify().encode("utf-8")) + description_tag = await self.internalize_images(description) + sink.file.write(cast(str, description_tag.prettify()).encode("utf-8")) sink.done() @anoncritical @@ -483,7 +485,7 @@ instance's greatest bottleneck. async with self.session.get(export_url, allow_redirects=False) as resp: # No redirect means we were authenticated if hdrs.LOCATION not in resp.headers: - return soupify(await resp.read()).select_one("a").get("href").strip() + return soupify(await resp.read()).select_one("a").get("href").strip() # type: ignore # We are either unauthenticated or the link is not active new_url = resp.headers[hdrs.LOCATION].lower() if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url: @@ -707,6 +709,8 @@ instance's greatest bottleneck. async with cl: next_stage_url = element.url + page = None + while next_stage_url: log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") log.explain(f"URL: {next_stage_url}") @@ -719,7 +723,7 @@ instance's greatest bottleneck. else: break - download_data = page.get_download_forum_data() + download_data = cast(IliasPage, page).get_download_forum_data() if not download_data: raise CrawlWarning("Failed to extract forum data") if download_data.empty: @@ -751,8 +755,8 @@ instance's greatest bottleneck. async with maybe_dl as (bar, sink): content = "\n" - content += element.title_tag.prettify() - content += element.content_tag.prettify() + content += cast(str, element.title_tag.prettify()) + content += cast(str, element.content_tag.prettify()) sink.file.write(content.encode("utf-8")) sink.done() @@ -877,15 +881,15 @@ instance's greatest bottleneck. 
continue if elem.name == "img": if src := elem.attrs.get("src", None): - url = urljoin(self._base_url, src) + url = urljoin(self._base_url, cast(str, src)) if not url.startswith(self._base_url): continue log.explain(f"Internalizing {url!r}") img = await self._get_authenticated(url) elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode() - if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): + if elem.name == "iframe" and cast(str, elem.attrs.get("src", "")).startswith("//"): # For unknown reasons the protocol seems to be stripped. - elem.attrs["src"] = "https:" + elem.attrs["src"] + elem.attrs["src"] = "https:" + cast(str, elem.attrs["src"]) return tag def _ensure_not_seen(self, element: IliasPageElement, parent_path: PurePath) -> None: @@ -979,11 +983,11 @@ instance's greatest bottleneck. async with self.session.get(urljoin(self._base_url, "/login.php"), params=params) as request: login_page = soupify(await request.read()) - login_form = login_page.find("form", attrs={"name": "formlogin"}) + login_form = cast(Optional[Tag], login_page.find("form", attrs={"name": "formlogin"})) if login_form is None: raise CrawlError("Could not find the login form! Specified client id might be invalid.") - login_url = login_form.attrs.get("action") + login_url = cast(Optional[str], login_form.attrs.get("action")) if login_url is None: raise CrawlError("Could not find the action URL in the login form!") @@ -1004,14 +1008,14 @@ instance's greatest bottleneck. 
@staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: # Normal ILIAS pages - mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") + mainbar = cast(Optional[Tag], soup.find(class_="il-maincontrols-metabar")) if mainbar is not None: - login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) + login_button = mainbar.find(attrs={"href": lambda x: x is not None and "login.php" in x}) shib_login = soup.find(id="button_shib_login") return not login_button and not shib_login # Personal Desktop - if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): + if soup.find("a", attrs={"href": lambda x: x is not None and "block_type=pditems" in x}): return True # Video listing embeds do not have complete ILIAS html. Try to match them by diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 57c81e5..ee61cab 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import Dict, List, Optional, Union, cast +from typing import Dict, Optional, Union, cast from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -117,7 +117,7 @@ class IliasPageElement: @dataclass class IliasDownloadForumData: url: str - form_data: Dict[str, Union[str, List[str]]] + form_data: Dict[str, Union[str, list[str]]] empty: bool @@ -151,7 +151,7 @@ class IliasPage: return "goto.php?target=root_" in permalink return False - def get_child_elements(self) -> List[IliasPageElement]: + def get_child_elements(self) -> list[IliasPageElement]: """ Return all child page elements you can find here. 
""" @@ -177,10 +177,10 @@ class IliasPage: return self._find_normal_entries() def get_info_tab(self) -> Optional[IliasPageElement]: - tab: Optional[Tag] = self._soup.find( + tab: Optional[Tag] = cast(Optional[Tag], self._soup.find( name="a", - attrs={"href": lambda x: x and "cmdClass=ilinfoscreengui" in x} - ) + attrs={"href": lambda x: x is not None and "cmdClass=ilinfoscreengui" in x} + )) if tab is not None: return IliasPageElement.create_new( IliasElementType.INFO_TAB, @@ -193,7 +193,7 @@ class IliasPage: def is_interesting_class(name: str) -> bool: return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] - paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class) + paragraphs: list[Tag] = cast(list[Tag], self._soup.find_all(class_=is_interesting_class)) if not paragraphs: return None @@ -217,8 +217,8 @@ class IliasPage: def get_learning_module_data(self) -> Optional[IliasLearningModulePage]: if not self._is_learning_module_page(): return None - content = self._soup.select_one("#ilLMPageContent") - title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip() + content = cast(Tag, self._soup.select_one("#ilLMPageContent")) + title = cast(Tag, self._soup.select_one(".ilc_page_title_PageTitle")).get_text().strip() return IliasLearningModulePage( title=title, content=content, @@ -243,15 +243,18 @@ class IliasPage: return None def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: - form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) + form = cast(Optional[Tag], self._soup.find( + "form", + attrs={"action": lambda x: x is not None and "fallbackCmd=showThreads" in x} + )) if not form: return None - post_url = self._abs_url_from_relative(form["action"]) + post_url = self._abs_url_from_relative(cast(str, form["action"])) - thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] + thread_ids = [f["value"] for f in cast(list[Tag], 
form.find_all(attrs={"name": "thread_ids[]"}))] - form_data: Dict[str, Union[str, List[str]]] = { - "thread_ids[]": thread_ids, + form_data: Dict[str, Union[str, list[str]]] = { + "thread_ids[]": cast(list[str], thread_ids), "selected_cmd2": "html", "select_cmd2": "Ausführen", "selected_cmd": "", @@ -285,7 +288,7 @@ class IliasPage: def _is_forum_page(self) -> bool: read_more_btn = self._soup.find( "button", - attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} + attrs={"onclick": lambda x: x is not None and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} ) return read_more_btn is not None @@ -297,7 +300,7 @@ class IliasPage: return True # Raw listing without ILIAS fluff - video_element_table: Tag = self._soup.find( + video_element_table = self._soup.find( name="table", id=re.compile(r"tbl_xoct_.+") ) return video_element_table is not None @@ -305,8 +308,8 @@ class IliasPage: def _is_ilias_opencast_embedding(self) -> bool: # ILIAS fluff around the real opencast html if self._soup.find(id="headerimage"): - element: Tag = self._soup.find(id="headerimage") - if "opencast" in element.attrs["src"].lower(): + element: Tag = cast(Tag, self._soup.find(id="headerimage")) + if "opencast" in cast(str, element.attrs["src"]).lower(): return True return False @@ -317,8 +320,8 @@ class IliasPage: # We have no suitable parent - let's guesss if self._soup.find(id="headerimage"): - element: Tag = self._soup.find(id="headerimage") - if "exc" in element.attrs["src"].lower(): + element: Tag = cast(Tag, self._soup.find(id="headerimage")) + if "exc" in cast(str, element.attrs["src"]).lower(): return True return False @@ -340,10 +343,10 @@ class IliasPage: return self._uncollapse_future_meetings_url() is not None def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: - element = self._soup.find( + element = cast(Optional[Tag], self._soup.find( "a", - attrs={"href": lambda x: x and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)} - ) + 
attrs={"href": lambda x: x is not None and ("crs_next_sess=1" in x or "crs_prev_sess=1" in x)} + )) if not element: return None link = self._abs_url_from_link(element) @@ -360,24 +363,24 @@ class IliasPage: return "baseClass=ilmembershipoverviewgui" in self._page_url def _select_content_page_url(self) -> Optional[IliasPageElement]: - tab = self._soup.find( + tab = cast(Optional[Tag], self._soup.find( id="tab_view_content", attrs={"class": lambda x: x is not None and "active" not in x} - ) + )) # Already selected (or not found) if not tab: return None - link = tab.find("a") + link = cast(Optional[Tag], tab.find("a")) if link: - link = self._abs_url_from_link(link) - return IliasPageElement.create_new(IliasElementType.FOLDER, link, "select content page") + link_str = self._abs_url_from_link(link) + return IliasPageElement.create_new(IliasElementType.FOLDER, link_str, "select content page") _unexpected_html_warning() log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.") log.warn_contd("PFERD might not find content on the course's main page.") return None - def _player_to_video(self) -> List[IliasPageElement]: + def _player_to_video(self) -> list[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. 
The actual video stream url is nowhere # on the page, but defined in a JS object inside a script tag, passed to the player @@ -414,10 +417,10 @@ class IliasPage: return items def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]: - correct_link = self._soup.find( + correct_link = cast(Optional[Tag], self._soup.find( "a", - attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x} - ) + attrs={"href": lambda x: x is not None and "trows=800" in x and "cmd=showThreads" in x} + )) if not correct_link: return None @@ -426,15 +429,15 @@ class IliasPage: return IliasPageElement.create_new(IliasElementType.FORUM, link, "show all forum threads") - def _find_personal_desktop_entries(self) -> List[IliasPageElement]: - items: List[IliasPageElement] = [] + def _find_personal_desktop_entries(self) -> list[IliasPageElement]: + items: list[IliasPageElement] = [] - titles: List[Tag] = self._soup.select("#block_pditems_0 .il-item-title") + titles: list[Tag] = self._soup.select("#block_pditems_0 .il-item-title") for title in titles: - link = title.find("a") + link = cast(Optional[Tag], title.find("a")) if not link: - log.explain(f"Skipping offline item: {title.getText().strip()!r}") + log.explain(f"Skipping offline item: {title.get_text().strip()!r}") continue name = _sanitize_path_name(link.text.strip()) @@ -460,13 +463,13 @@ class IliasPage: return items - def _find_copa_entries(self) -> List[IliasPageElement]: - items: List[IliasPageElement] = [] - links: List[Tag] = self._soup.findAll(class_="ilc_flist_a_FileListItemLink") + def _find_copa_entries(self) -> list[IliasPageElement]: + items: list[IliasPageElement] = [] + links: list[Tag] = cast(list[Tag], self._soup.find_all(class_="ilc_flist_a_FileListItemLink")) for link in links: url = self._abs_url_from_link(link) - name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.getText()).strip().replace("\t", "") + name = re.sub(r"\([\d,.]+ [MK]B\)", "", link.get_text()).strip().replace("\t", "") name 
= _sanitize_path_name(name) if "file_id" not in url: @@ -478,9 +481,9 @@ class IliasPage: return items - def _find_info_tab_entries(self) -> List[IliasPageElement]: + def _find_info_tab_entries(self) -> list[IliasPageElement]: items = [] - links: List[Tag] = self._soup.select("a.il_ContainerItemCommand") + links: list[Tag] = self._soup.select("a.il_ContainerItemCommand") for link in links: if "cmdClass=ilobjcoursegui" not in link["href"]: @@ -490,12 +493,12 @@ class IliasPage: items.append(IliasPageElement.create_new( IliasElementType.FILE, self._abs_url_from_link(link), - _sanitize_path_name(link.getText()) + _sanitize_path_name(link.get_text()) )) return items - def _find_opencast_video_entries(self) -> List[IliasPageElement]: + def _find_opencast_video_entries(self) -> list[IliasPageElement]: # ILIAS has three stages for video pages # 1. The initial dummy page without any videos. This page contains the link to the listing # 2. The video listing which might be paginated @@ -503,14 +506,14 @@ class IliasPage: # # We need to figure out where we are. 
- video_element_table: Tag = self._soup.find( + video_element_table = cast(Optional[Tag], self._soup.find( name="table", id=re.compile(r"tbl_xoct_.+") - ) + )) if video_element_table is None: # We are in stage 1 # The page is actually emtpy but contains the link to stage 2 - content_link: Tag = self._soup.select_one("#tab_series a") + content_link: Tag = cast(Tag, self._soup.select_one("#tab_series a")) url: str = self._abs_url_from_link(content_link) query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) @@ -527,14 +530,14 @@ class IliasPage: return self._find_opencast_video_entries_no_paging() - def _find_opencast_video_entries_paginated(self) -> List[IliasPageElement]: - table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) + def _find_opencast_video_entries_paginated(self) -> list[IliasPageElement]: + table_element = cast(Optional[Tag], self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))) if table_element is None: log.warn("Couldn't increase elements per page (table not found). I might miss elements.") return self._find_opencast_video_entries_no_paging() - id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) + id_match = re.match(r"tbl_xoct_(.+)", cast(str, table_element.attrs["id"])) if id_match is None: log.warn("Couldn't increase elements per page (table id not found). I might miss elements.") return self._find_opencast_video_entries_no_paging() @@ -548,16 +551,16 @@ class IliasPage: log.explain("Disabled pagination, retrying folder as a new entry") return [IliasPageElement.create_new(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] - def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_no_paging(self) -> list[IliasPageElement]: """ Crawls the "second stage" video page. This page contains the actual video urls. 
""" # Video start links are marked with an "Abspielen" link - video_links: List[Tag] = self._soup.findAll( + video_links = cast(list[Tag], self._soup.find_all( name="a", text=re.compile(r"\s*(Abspielen|Play)\s*") - ) + )) - results: List[IliasPageElement] = [] + results: list[IliasPageElement] = [] for link in video_links: results.append(self._listed_opencast_video_to_element(link)) @@ -569,12 +572,12 @@ class IliasPage: # 6th or 7th child (1 indexed) is the modification time string. Try to find it # by parsing backwards from the end and finding something that looks like a date modification_time = None - row: Tag = link.parent.parent.parent + row: Tag = link.parent.parent.parent # type: ignore column_count = len(row.select("td.std")) for index in range(column_count, 0, -1): - modification_string = link.parent.parent.parent.select_one( + modification_string = link.parent.parent.parent.select_one( # type: ignore f"td.std:nth-child({index})" - ).getText().strip() + ).get_text().strip() if match := re.search(r"\d+\.\d+.\d+ \d+:\d+", modification_string): modification_time = datetime.strptime(match.group(0), "%d.%m.%Y %H:%M") break @@ -583,7 +586,7 @@ class IliasPage: log.warn(f"Could not determine upload time for {link}") modification_time = datetime.now() - title = link.parent.parent.parent.select_one("td.std:nth-child(3)").getText().strip() + title = link.parent.parent.parent.select_one("td.std:nth-child(3)").get_text().strip() # type: ignore title += ".mp4" video_name: str = _sanitize_path_name(title) @@ -595,33 +598,34 @@ class IliasPage: IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time ) - def _find_exercise_entries(self) -> List[IliasPageElement]: + def _find_exercise_entries(self) -> list[IliasPageElement]: if self._soup.find(id="tab_submission"): log.explain("Found submission tab. This is an exercise detail page") return self._find_exercise_entries_detail_page() log.explain("Found no submission tab. 
This is an exercise root page") return self._find_exercise_entries_root_page() - def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]: - results: List[IliasPageElement] = [] + def _find_exercise_entries_detail_page(self) -> list[IliasPageElement]: + results: list[IliasPageElement] = [] # Find all download links in the container (this will contain all the files) - download_links: List[Tag] = self._soup.findAll( + download_links = cast(list[Tag], self._soup.find_all( name="a", # download links contain the given command class - attrs={"href": lambda x: x and "cmd=download" in x}, + attrs={"href": lambda x: x is not None and "cmd=download" in x}, text="Download" - ) + )) for link in download_links: - parent_row: Tag = link.findParent("tr") - children: List[Tag] = parent_row.findChildren("td") + parent_row: Tag = cast(Tag, link.find_parent("tr")) + children = cast(list[Tag], parent_row.find_all("td")) - name = _sanitize_path_name(children[1].getText().strip()) + name = _sanitize_path_name(children[1].get_text().strip()) log.explain(f"Found exercise detail entry {name!r}") + date = None for child in reversed(children): - date = demangle_date(child.getText().strip(), fail_silently=True) + date = demangle_date(child.get_text().strip(), fail_silently=True) if date is not None: break if date is None: @@ -636,30 +640,33 @@ class IliasPage: return results - def _find_exercise_entries_root_page(self) -> List[IliasPageElement]: - results: List[IliasPageElement] = [] + def _find_exercise_entries_root_page(self) -> list[IliasPageElement]: + results: list[IliasPageElement] = [] # Each assignment is in an accordion container - assignment_containers: List[Tag] = self._soup.select(".il_VAccordionInnerContainer") + assignment_containers: list[Tag] = self._soup.select(".il_VAccordionInnerContainer") for container in assignment_containers: # Fetch the container name out of the header to use it in the path - container_name = 
container.select_one(".ilAssignmentHeader").getText().strip() + container_name = cast(Tag, container.select_one(".ilAssignmentHeader")).get_text().strip() log.explain(f"Found exercise container {container_name!r}") # Find all download links in the container (this will contain all the files) - files: List[Tag] = container.findAll( + files = cast(list[Tag], container.find_all( name="a", # download links contain the given command class - attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x}, + attrs={"href": lambda x: x is not None and "cmdClass=ilexsubmissiongui" in x}, text="Download" - ) + )) # Grab each file as you now have the link for file_link in files: # Two divs, side by side. Left is the name, right is the link ==> get left # sibling - file_name = file_link.parent.findPrevious(name="div").getText().strip() + file_name = cast( + Tag, + cast(Tag, file_link.parent).find_previous(name="div") + ).get_text().strip() url = self._abs_url_from_link(file_link) log.explain(f"Found exercise entry {file_name!r}") @@ -672,21 +679,21 @@ class IliasPage: )) # Find all links to file listings (e.g. 
"Submitted Files" for groups) - file_listings: List[Tag] = container.findAll( + file_listings = cast(list[Tag], container.find_all( name="a", # download links contain the given command class - attrs={"href": lambda x: x and "cmdclass=ilexsubmissionfilegui" in x.lower()} - ) + attrs={"href": lambda x: x is not None and "cmdclass=ilexsubmissionfilegui" in x.lower()} + )) # Add each listing as a new for listing in file_listings: - parent_container: Tag = listing.findParent( - "div", attrs={"class": lambda x: x and "form-group" in x} - ) - label_container: Tag = parent_container.find( - attrs={"class": lambda x: x and "control-label" in x} - ) - file_name = label_container.getText().strip() + parent_container = cast(Tag, listing.find_parent( + "div", attrs={"class": lambda x: x is not None and "form-group" in x} + )) + label_container = cast(Tag, parent_container.find( + attrs={"class": lambda x: x is not None and "control-label" in x} + )) + file_name = label_container.get_text().strip() url = self._abs_url_from_link(listing) log.explain(f"Found exercise detail {file_name!r} at {url}") results.append(IliasPageElement.create_new( @@ -699,10 +706,10 @@ class IliasPage: return results - def _find_normal_entries(self) -> List[IliasPageElement]: - result: List[IliasPageElement] = [] + def _find_normal_entries(self) -> list[IliasPageElement]: + result: list[IliasPageElement] = [] - links: List[Tag] = [] + links: list[Tag] = [] # Fetch all links and throw them to the general interpreter if self._is_course_overview_page(): log.explain("Page is a course overview page, adjusting link selector") @@ -716,9 +723,9 @@ class IliasPage: parents = [_sanitize_path_name(x) for x in self._find_upwards_folder_hierarchy(link)] if parents: - element_name = "/".join(parents) + "/" + _sanitize_path_name(link.getText()) + element_name = "/".join(parents) + "/" + _sanitize_path_name(link.get_text()) else: - element_name = _sanitize_path_name(link.getText()) + element_name = 
_sanitize_path_name(link.get_text()) element_type = self._find_type_from_link(element_name, link, abs_url) description = self._find_link_description(link) @@ -750,17 +757,17 @@ class IliasPage: return result - def _find_mediacast_videos(self) -> List[IliasPageElement]: - videos: List[IliasPageElement] = [] + def _find_mediacast_videos(self) -> list[IliasPageElement]: + videos: list[IliasPageElement] = [] - for elem in cast(List[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): + for elem in cast(list[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): element_name = _sanitize_path_name( - elem.select_one(".ilPlayerPreviewDescription").getText().strip() + cast(Tag, elem.select_one(".ilPlayerPreviewDescription")).get_text().strip() ) if not element_name.endswith(".mp4"): # just to make sure it has some kinda-alrightish ending element_name = element_name + ".mp4" - video_element = elem.find(name="video") + video_element = cast(Optional[Tag], elem.find(name="video")) if not video_element: _unexpected_html_warning() log.warn_contd(f"No