From 998abc3afd10b7ae387d5a4b1728c17fb047a362 Mon Sep 17 00:00:00 2001 From: PinieP <59698589+PinieP@users.noreply.github.com> Date: Sat, 2 Nov 2024 21:54:57 +0100 Subject: [PATCH] Fix line wrapping --- PFERD/crawl/ilias/ilias_web_crawler.py | 87 +++++++++----------------- PFERD/crawl/ilias/shibboleth_login.py | 3 +- 2 files changed, 30 insertions(+), 60 deletions(-) diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index fe94d39..e6887c3 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -53,8 +53,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection): self.missing_value("client_id") return LoginTypeLocal(client_id) - self.invalid_value("login_type", login_type, - "Should be ") + self.invalid_value("login_type", login_type, "Should be ") def tfa_auth( self, authenticators: Dict[str, Authenticator] @@ -64,8 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection): return None auth = authenticators.get(value) if auth is None: - self.invalid_value("tfa_auth", value, - "No such auth section exists") + self.invalid_value("tfa_auth", value, "No such auth section exists") return auth def target(self) -> TargetType: @@ -83,8 +81,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection): # URL return target - self.invalid_value( - "target", target, "Should be ") + self.invalid_value("target", target, "Should be ") def links(self) -> Links: type_str: Optional[str] = self.s.get("links") @@ -200,8 +197,7 @@ instance's greatest bottleneck. async def _run(self) -> None: if isinstance(self._target, int): - log.explain_topic( - f"Inferred crawl target: Course with id {self._target}") + log.explain_topic(f"Inferred crawl target: Course with id {self._target}") await self._crawl_course(self._target) elif self._target == "desktop": log.explain_topic("Inferred crawl target: Personal desktop") @@ -400,8 +396,7 @@ instance's greatest bottleneck. return None else: log.explain("Answer: Yes") - element_path = element_path.with_name( - element_path.name + link_extension) + element_path = element_path.with_name(element_path.name + link_extension) maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: @@ -413,11 +408,9 @@ instance's greatest bottleneck. @_iorepeat(3, "resolving link") async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: async with dl as (bar, sink): - export_url = element.url.replace( - "cmd=calldirectlink", "cmd=exportHTML") + export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") real_url = await self._resolve_link_target(export_url) - self._write_link_content( - link_template, real_url, element.name, element.description, sink) + self._write_link_content(link_template, real_url, element.name, element.description, sink) def _write_link_content( self, @@ -431,8 +424,7 @@ instance's greatest bottleneck. content = content.replace("{{link}}", url) content = content.replace("{{name}}", name) content = content.replace("{{description}}", str(description)) - content = content.replace( - "{{redirect_delay}}", str(self._link_file_redirect_delay)) + content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay)) sink.file.write(content.encode("utf-8")) sink.done() @@ -451,8 +443,7 @@ instance's greatest bottleneck. return None else: log.explain("Answer: Yes") - element_path = element_path.with_name( - element_path.name + link_extension) + element_path = element_path.with_name(element_path.name + link_extension) maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: @@ -485,8 +476,7 @@ instance's greatest bottleneck. dl: DownloadToken, ) -> None: async with dl as (bar, sink): - self._write_link_content( - link_template, element.url, element.name, element.description, sink) + self._write_link_content(link_template, element.url, element.name, element.description, sink) async def _resolve_link_target(self, export_url: str) -> str: async def impl() -> Optional[str]: @@ -510,8 +500,7 @@ instance's greatest bottleneck. if target is not None: return target - raise CrawlError( - "resolve_link_target failed even after authenticating") + raise CrawlError("resolve_link_target failed even after authenticating") async def _handle_opencast_video( self, @@ -522,8 +511,7 @@ instance's greatest bottleneck. if self.prev_report: self.report.add_custom_value( _get_video_cache_key(element), - self.prev_report.get_custom_value( - _get_video_cache_key(element)) + self.prev_report.get_custom_value(_get_video_cache_key(element)) ) # A video might contain other videos, so let's "crawl" the video first @@ -544,8 +532,7 @@ instance's greatest bottleneck. # Mark all existing videos as known to ensure they do not get deleted during cleanup. # We "downloaded" them, just without actually making a network request as we assumed # they did not change. - contained = self._previous_contained_opencast_videos( - element, maybe_dl.path) + contained = self._previous_contained_opencast_videos(element, maybe_dl.path) if len(contained) > 1: # Only do this if we threw away the original dl token, # to not download single-stream videos twice @@ -561,8 +548,7 @@ instance's greatest bottleneck. ) -> List[PurePath]: if not self.prev_report: return [] - custom_value = self.prev_report.get_custom_value( - _get_video_cache_key(element)) + custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element)) if not custom_value: return [] cached_value = cast(dict[str, Any], custom_value) @@ -584,11 +570,9 @@ instance's greatest bottleneck. f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}" ) if all(self._output_dir.resolve(path).exists() for path in contained_videos): - log.explain( - "Found all known videos locally, skipping enumeration request") + log.explain("Found all known videos locally, skipping enumeration request") return True - log.explain( - "Missing at least one video, continuing with requests!") + log.explain("Missing at least one video, continuing with requests!") else: log.explain("No local cache present") return False @@ -599,8 +583,7 @@ instance's greatest bottleneck. def add_to_report(paths: list[str]) -> None: self.report.add_custom_value( _get_video_cache_key(element), - {"known_paths": paths, "own_path": str( - self._transformer.transform(dl.path))} + {"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))} ) async with dl as (bar, sink): @@ -628,8 +611,7 @@ instance's greatest bottleneck. continue async with maybe_dl as (bar, sink): log.explain(f"Streaming video from real url {stream_element.url}") - contained_video_paths.append( - str(self._transformer.transform(maybe_dl.path))) + contained_video_paths.append(str(self._transformer.transform(maybe_dl.path))) await self._stream_from_url(stream_element.url, sink, bar, is_video=True) add_to_report(contained_video_paths) @@ -749,8 +731,7 @@ instance's greatest bottleneck. tasks: List[Awaitable[None]] = [] for elem in elements: - tasks.append(asyncio.create_task( - self._download_forum_thread(cl.path, elem))) + tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem))) # And execute them await self.gather(tasks) @@ -811,11 +792,9 @@ instance's greatest bottleneck. tasks: List[Awaitable[None]] = [] for index, elem in enumerate(elements): prev_url = elements[index - 1].title if index > 0 else None - next_url = elements[index + - 1].title if index < len(elements) - 1 else None + next_url = elements[index + 1].title if index < len(elements) - 1 else None tasks.append(asyncio.create_task( - self._download_learning_module_page( - cl.path, elem, prev_url, next_url) + self._download_learning_module_page(cl.path, elem, prev_url, next_url) )) # And execute them @@ -868,15 +847,13 @@ instance's greatest bottleneck. return if prev: - prev_p = self._transformer.transform( - parent_path / (_sanitize_path_name(prev) + ".html")) + prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) if prev_p: prev = os.path.relpath(prev_p, my_path.parent) else: prev = None if next: - next_p = self._transformer.transform( - parent_path / (_sanitize_path_name(next) + ".html")) + next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) if next_p: next = os.path.relpath(next_p, my_path.parent) else: @@ -885,8 +862,7 @@ instance's greatest bottleneck. async with maybe_dl as (bar, sink): content = element.content content = await self.internalize_images(content) - sink.file.write(learning_module_template( - content, maybe_dl.path.name, prev, next).encode("utf-8")) + sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8")) sink.done() async def internalize_images(self, tag: Tag) -> Tag: @@ -904,8 +880,7 @@ instance's greatest bottleneck. continue log.explain(f"Internalizing {url!r}") img = await self._get_authenticated(url) - elem.attrs["src"] = "data:;base64," + \ - base64.b64encode(img).decode() + elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode() if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): # For unknown reasons the protocol seems to be stripped. elem.attrs["src"] = "https:" + elem.attrs["src"] @@ -935,8 +910,7 @@ instance's greatest bottleneck. soup = soupify(await request.read()) if IliasPage.is_logged_in(soup): return self._verify_page(soup, url, root_page_allowed) - raise CrawlError( - f"get_page failed even after authenticating on {url!r}") + raise CrawlError(f"get_page failed even after authenticating on {url!r}") @staticmethod def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: @@ -1007,13 +981,11 @@ instance's greatest bottleneck. login_form = login_page.find("form", attrs={"name": "formlogin"}) if login_form is None: - raise CrawlError( - "Could not find the login form! Specified client id might be invalid.") + raise CrawlError("Could not find the login form! Specified client id might be invalid.") login_url = login_form.attrs.get("action") if login_url is None: - raise CrawlError( - "Could not find the action URL in the login form!") + raise CrawlError("Could not find the action URL in the login form!") username, password = await self._auth.credentials() @@ -1034,8 +1006,7 @@ instance's greatest bottleneck. # Normal ILIAS pages mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") if mainbar is not None: - login_button = mainbar.find( - attrs={"href": lambda x: x and "login.php" in x}) + login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) shib_login = soup.find(id="button_shib_login") return not login_button and not shib_login diff --git a/PFERD/crawl/ilias/shibboleth_login.py b/PFERD/crawl/ilias/shibboleth_login.py index 42257bd..28f23c2 100644 --- a/PFERD/crawl/ilias/shibboleth_login.py +++ b/PFERD/crawl/ilias/shibboleth_login.py @@ -48,8 +48,7 @@ class ShibbolethLogin: while not self._login_successful(soup): # Searching the form here so that this fails before asking for # credentials rather than after asking. - form = soup.find( - "form", {"method": "post"}) + form = soup.find("form", {"method": "post"}) action = form["action"] # Equivalent: Enter credentials in