Fix line wrapping

2026-04-12 15:35:05 +02:00 · 2024-11-02 21:54:57 +01:00 · 2024-11-02 21:54:57 +01:00 · 998abc3afd
commit 998abc3afd
parent 27667822eb
2 changed files with 30 additions and 60 deletions
--- a/PFERD/crawl/ilias/ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/ilias_web_crawler.py
@ -53,8 +53,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
                self.missing_value("client_id")
            return LoginTypeLocal(client_id)
-        self.invalid_value("login_type", login_type,
+        self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
                           "Should be <shibboleth | local>")
    def tfa_auth(
        self, authenticators: Dict[str, Authenticator]
@ -64,8 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
            return None
        auth = authenticators.get(value)
        if auth is None:
-            self.invalid_value("tfa_auth", value,
+            self.invalid_value("tfa_auth", value, "No such auth section exists")
                               "No such auth section exists")
        return auth
    def target(self) -> TargetType:
@ -83,8 +81,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
            # URL
            return target
-        self.invalid_value(
+        self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
            "target", target, "Should be <course id | desktop | kit ilias URL>")
    def links(self) -> Links:
        type_str: Optional[str] = self.s.get("links")
@ -200,8 +197,7 @@ instance's greatest bottleneck.
    async def _run(self) -> None:
        if isinstance(self._target, int):
-            log.explain_topic(
+            log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
                f"Inferred crawl target: Course with id {self._target}")
            await self._crawl_course(self._target)
        elif self._target == "desktop":
            log.explain_topic("Inferred crawl target: Personal desktop")
@ -400,8 +396,7 @@ instance's greatest bottleneck.
            return None
        else:
            log.explain("Answer: Yes")
-        element_path = element_path.with_name(
+        element_path = element_path.with_name(element_path.name + link_extension)
            element_path.name + link_extension)
        maybe_dl = await self.download(element_path, mtime=element.mtime)
        if not maybe_dl:
@ -413,11 +408,9 @@ instance's greatest bottleneck.
    @_iorepeat(3, "resolving link")
    async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
        async with dl as (bar, sink):
-            export_url = element.url.replace(
+            export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
                "cmd=calldirectlink", "cmd=exportHTML")
            real_url = await self._resolve_link_target(export_url)
-            self._write_link_content(
+            self._write_link_content(link_template, real_url, element.name, element.description, sink)
                link_template, real_url, element.name, element.description, sink)
    def _write_link_content(
        self,
@ -431,8 +424,7 @@ instance's greatest bottleneck.
        content = content.replace("{{link}}", url)
        content = content.replace("{{name}}", name)
        content = content.replace("{{description}}", str(description))
-        content = content.replace(
+        content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
            "{{redirect_delay}}", str(self._link_file_redirect_delay))
        sink.file.write(content.encode("utf-8"))
        sink.done()
@ -451,8 +443,7 @@ instance's greatest bottleneck.
            return None
        else:
            log.explain("Answer: Yes")
-        element_path = element_path.with_name(
+        element_path = element_path.with_name(element_path.name + link_extension)
            element_path.name + link_extension)
        maybe_dl = await self.download(element_path, mtime=element.mtime)
        if not maybe_dl:
@ -485,8 +476,7 @@ instance's greatest bottleneck.
        dl: DownloadToken,
    ) -> None:
        async with dl as (bar, sink):
-            self._write_link_content(
+            self._write_link_content(link_template, element.url, element.name, element.description, sink)
                link_template, element.url, element.name, element.description, sink)
    async def _resolve_link_target(self, export_url: str) -> str:
        async def impl() -> Optional[str]:
@ -510,8 +500,7 @@ instance's greatest bottleneck.
        if target is not None:
            return target
-        raise CrawlError(
+        raise CrawlError("resolve_link_target failed even after authenticating")
            "resolve_link_target failed even after authenticating")
    async def _handle_opencast_video(
        self,
@ -522,8 +511,7 @@ instance's greatest bottleneck.
        if self.prev_report:
            self.report.add_custom_value(
                _get_video_cache_key(element),
-                self.prev_report.get_custom_value(
+                self.prev_report.get_custom_value(_get_video_cache_key(element))
                    _get_video_cache_key(element))
            )
        # A video might contain other videos, so let's "crawl" the video first
@ -544,8 +532,7 @@ instance's greatest bottleneck.
            # Mark all existing videos as known to ensure they do not get deleted during cleanup.
            # We "downloaded" them, just without actually making a network request as we assumed
            # they did not change.
-            contained = self._previous_contained_opencast_videos(
+            contained = self._previous_contained_opencast_videos(element, maybe_dl.path)
                element, maybe_dl.path)
            if len(contained) > 1:
                # Only do this if we threw away the original dl token,
                # to not download single-stream videos twice
@ -561,8 +548,7 @@ instance's greatest bottleneck.
    ) -> List[PurePath]:
        if not self.prev_report:
            return []
-        custom_value = self.prev_report.get_custom_value(
+        custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
            _get_video_cache_key(element))
        if not custom_value:
            return []
        cached_value = cast(dict[str, Any], custom_value)
@ -584,11 +570,9 @@ instance's greatest bottleneck.
                f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}"
            )
            if all(self._output_dir.resolve(path).exists() for path in contained_videos):
-                log.explain(
+                log.explain("Found all known videos locally, skipping enumeration request")
                    "Found all known videos locally, skipping enumeration request")
                return True
-            log.explain(
+            log.explain("Missing at least one video, continuing with requests!")
                "Missing at least one video, continuing with requests!")
        else:
            log.explain("No local cache present")
        return False
@ -599,8 +583,7 @@ instance's greatest bottleneck.
        def add_to_report(paths: list[str]) -> None:
            self.report.add_custom_value(
                _get_video_cache_key(element),
-                {"known_paths": paths, "own_path": str(
+                {"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))}
                    self._transformer.transform(dl.path))}
            )
        async with dl as (bar, sink):
@ -628,8 +611,7 @@ instance's greatest bottleneck.
                continue
            async with maybe_dl as (bar, sink):
                log.explain(f"Streaming video from real url {stream_element.url}")
-                contained_video_paths.append(
+                contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
                    str(self._transformer.transform(maybe_dl.path)))
                await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
        add_to_report(contained_video_paths)
@ -749,8 +731,7 @@ instance's greatest bottleneck.
        tasks: List[Awaitable[None]] = []
        for elem in elements:
-            tasks.append(asyncio.create_task(
+            tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
                self._download_forum_thread(cl.path, elem)))
        # And execute them
        await self.gather(tasks)
@ -811,11 +792,9 @@ instance's greatest bottleneck.
        tasks: List[Awaitable[None]] = []
        for index, elem in enumerate(elements):
            prev_url = elements[index - 1].title if index > 0 else None
-            next_url = elements[index +
+            next_url = elements[index + 1].title if index < len(elements) - 1 else None
                                1].title if index < len(elements) - 1 else None
            tasks.append(asyncio.create_task(
-                self._download_learning_module_page(
+                self._download_learning_module_page(cl.path, elem, prev_url, next_url)
                    cl.path, elem, prev_url, next_url)
            ))
        # And execute them
@ -868,15 +847,13 @@ instance's greatest bottleneck.
            return
        if prev:
-            prev_p = self._transformer.transform(
+            prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
                parent_path / (_sanitize_path_name(prev) + ".html"))
            if prev_p:
                prev = os.path.relpath(prev_p, my_path.parent)
            else:
                prev = None
        if next:
-            next_p = self._transformer.transform(
+            next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
                parent_path / (_sanitize_path_name(next) + ".html"))
            if next_p:
                next = os.path.relpath(next_p, my_path.parent)
            else:
@ -885,8 +862,7 @@ instance's greatest bottleneck.
        async with maybe_dl as (bar, sink):
            content = element.content
            content = await self.internalize_images(content)
-            sink.file.write(learning_module_template(
+            sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8"))
                content, maybe_dl.path.name, prev, next).encode("utf-8"))
            sink.done()
    async def internalize_images(self, tag: Tag) -> Tag:
@ -904,8 +880,7 @@ instance's greatest bottleneck.
                        continue
                    log.explain(f"Internalizing {url!r}")
                    img = await self._get_authenticated(url)
-                    elem.attrs["src"] = "data:;base64," + \
+                    elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
                        base64.b64encode(img).decode()
            if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
                # For unknown reasons the protocol seems to be stripped.
                elem.attrs["src"] = "https:" + elem.attrs["src"]
@ -935,8 +910,7 @@ instance's greatest bottleneck.
            soup = soupify(await request.read())
            if IliasPage.is_logged_in(soup):
                return self._verify_page(soup, url, root_page_allowed)
-        raise CrawlError(
+        raise CrawlError(f"get_page failed even after authenticating on {url!r}")
            f"get_page failed even after authenticating on {url!r}")
    @staticmethod
    def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
@ -1007,13 +981,11 @@ instance's greatest bottleneck.
            login_form = login_page.find("form", attrs={"name": "formlogin"})
            if login_form is None:
-                raise CrawlError(
+                raise CrawlError("Could not find the login form! Specified client id might be invalid.")
                    "Could not find the login form! Specified client id might be invalid.")
            login_url = login_form.attrs.get("action")
            if login_url is None:
-                raise CrawlError(
+                raise CrawlError("Could not find the action URL in the login form!")
                    "Could not find the action URL in the login form!")
            username, password = await self._auth.credentials()
@ -1034,8 +1006,7 @@ instance's greatest bottleneck.
        # Normal ILIAS pages
        mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
        if mainbar is not None:
-            login_button = mainbar.find(
+            login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
                attrs={"href": lambda x: x and "login.php" in x})
            shib_login = soup.find(id="button_shib_login")
            return not login_button and not shib_login
--- a/PFERD/crawl/ilias/shibboleth_login.py
+++ b/PFERD/crawl/ilias/shibboleth_login.py
@ -48,8 +48,7 @@ class ShibbolethLogin:
        while not self._login_successful(soup):
            # Searching the form here so that this fails before asking for
            # credentials rather than after asking.
-            form = soup.find(
+            form = soup.find("form", {"method": "post"})
                "form", {"method": "post"})
            action = form["action"]
            # Equivalent: Enter credentials in