Fix line wrapping

This commit is contained in:
PinieP 2024-11-02 21:54:57 +01:00 committed by I-Al-Istannen
parent 27667822eb
commit 998abc3afd
2 changed files with 30 additions and 60 deletions

View file

@@ -53,8 +53,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
self.missing_value("client_id")
return LoginTypeLocal(client_id)
self.invalid_value("login_type", login_type,
"Should be <shibboleth | local>")
self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
def tfa_auth(
self, authenticators: Dict[str, Authenticator]
@@ -64,8 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
return None
auth = authenticators.get(value)
if auth is None:
self.invalid_value("tfa_auth", value,
"No such auth section exists")
self.invalid_value("tfa_auth", value, "No such auth section exists")
return auth
def target(self) -> TargetType:
@@ -83,8 +81,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
# URL
return target
self.invalid_value(
"target", target, "Should be <course id | desktop | kit ilias URL>")
self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
def links(self) -> Links:
type_str: Optional[str] = self.s.get("links")
@@ -200,8 +197,7 @@ instance's greatest bottleneck.
async def _run(self) -> None:
if isinstance(self._target, int):
log.explain_topic(
f"Inferred crawl target: Course with id {self._target}")
log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
await self._crawl_course(self._target)
elif self._target == "desktop":
log.explain_topic("Inferred crawl target: Personal desktop")
@@ -400,8 +396,7 @@ instance's greatest bottleneck.
return None
else:
log.explain("Answer: Yes")
element_path = element_path.with_name(
element_path.name + link_extension)
element_path = element_path.with_name(element_path.name + link_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl:
@@ -413,11 +408,9 @@ instance's greatest bottleneck.
@_iorepeat(3, "resolving link")
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
async with dl as (bar, sink):
export_url = element.url.replace(
"cmd=calldirectlink", "cmd=exportHTML")
export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
real_url = await self._resolve_link_target(export_url)
self._write_link_content(
link_template, real_url, element.name, element.description, sink)
self._write_link_content(link_template, real_url, element.name, element.description, sink)
def _write_link_content(
self,
@@ -431,8 +424,7 @@ instance's greatest bottleneck.
content = content.replace("{{link}}", url)
content = content.replace("{{name}}", name)
content = content.replace("{{description}}", str(description))
content = content.replace(
"{{redirect_delay}}", str(self._link_file_redirect_delay))
content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
sink.file.write(content.encode("utf-8"))
sink.done()
@@ -451,8 +443,7 @@ instance's greatest bottleneck.
return None
else:
log.explain("Answer: Yes")
element_path = element_path.with_name(
element_path.name + link_extension)
element_path = element_path.with_name(element_path.name + link_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl:
@@ -485,8 +476,7 @@ instance's greatest bottleneck.
dl: DownloadToken,
) -> None:
async with dl as (bar, sink):
self._write_link_content(
link_template, element.url, element.name, element.description, sink)
self._write_link_content(link_template, element.url, element.name, element.description, sink)
async def _resolve_link_target(self, export_url: str) -> str:
async def impl() -> Optional[str]:
@@ -510,8 +500,7 @@ instance's greatest bottleneck.
if target is not None:
return target
raise CrawlError(
"resolve_link_target failed even after authenticating")
raise CrawlError("resolve_link_target failed even after authenticating")
async def _handle_opencast_video(
self,
@@ -522,8 +511,7 @@ instance's greatest bottleneck.
if self.prev_report:
self.report.add_custom_value(
_get_video_cache_key(element),
self.prev_report.get_custom_value(
_get_video_cache_key(element))
self.prev_report.get_custom_value(_get_video_cache_key(element))
)
# A video might contain other videos, so let's "crawl" the video first
@@ -544,8 +532,7 @@ instance's greatest bottleneck.
# Mark all existing videos as known to ensure they do not get deleted during cleanup.
# We "downloaded" them, just without actually making a network request as we assumed
# they did not change.
contained = self._previous_contained_opencast_videos(
element, maybe_dl.path)
contained = self._previous_contained_opencast_videos(element, maybe_dl.path)
if len(contained) > 1:
# Only do this if we threw away the original dl token,
# to not download single-stream videos twice
@@ -561,8 +548,7 @@ instance's greatest bottleneck.
) -> List[PurePath]:
if not self.prev_report:
return []
custom_value = self.prev_report.get_custom_value(
_get_video_cache_key(element))
custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
if not custom_value:
return []
cached_value = cast(dict[str, Any], custom_value)
@@ -584,11 +570,9 @@ instance's greatest bottleneck.
f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}"
)
if all(self._output_dir.resolve(path).exists() for path in contained_videos):
log.explain(
"Found all known videos locally, skipping enumeration request")
log.explain("Found all known videos locally, skipping enumeration request")
return True
log.explain(
"Missing at least one video, continuing with requests!")
log.explain("Missing at least one video, continuing with requests!")
else:
log.explain("No local cache present")
return False
@@ -599,8 +583,7 @@ instance's greatest bottleneck.
def add_to_report(paths: list[str]) -> None:
self.report.add_custom_value(
_get_video_cache_key(element),
{"known_paths": paths, "own_path": str(
self._transformer.transform(dl.path))}
{"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))}
)
async with dl as (bar, sink):
@@ -628,8 +611,7 @@ instance's greatest bottleneck.
continue
async with maybe_dl as (bar, sink):
log.explain(f"Streaming video from real url {stream_element.url}")
contained_video_paths.append(
str(self._transformer.transform(maybe_dl.path)))
contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
add_to_report(contained_video_paths)
@@ -749,8 +731,7 @@ instance's greatest bottleneck.
tasks: List[Awaitable[None]] = []
for elem in elements:
tasks.append(asyncio.create_task(
self._download_forum_thread(cl.path, elem)))
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
# And execute them
await self.gather(tasks)
@@ -811,11 +792,9 @@ instance's greatest bottleneck.
tasks: List[Awaitable[None]] = []
for index, elem in enumerate(elements):
prev_url = elements[index - 1].title if index > 0 else None
next_url = elements[index +
1].title if index < len(elements) - 1 else None
next_url = elements[index + 1].title if index < len(elements) - 1 else None
tasks.append(asyncio.create_task(
self._download_learning_module_page(
cl.path, elem, prev_url, next_url)
self._download_learning_module_page(cl.path, elem, prev_url, next_url)
))
# And execute them
@@ -868,15 +847,13 @@ instance's greatest bottleneck.
return
if prev:
prev_p = self._transformer.transform(
parent_path / (_sanitize_path_name(prev) + ".html"))
prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
if prev_p:
prev = os.path.relpath(prev_p, my_path.parent)
else:
prev = None
if next:
next_p = self._transformer.transform(
parent_path / (_sanitize_path_name(next) + ".html"))
next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
if next_p:
next = os.path.relpath(next_p, my_path.parent)
else:
@@ -885,8 +862,7 @@ instance's greatest bottleneck.
async with maybe_dl as (bar, sink):
content = element.content
content = await self.internalize_images(content)
sink.file.write(learning_module_template(
content, maybe_dl.path.name, prev, next).encode("utf-8"))
sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8"))
sink.done()
async def internalize_images(self, tag: Tag) -> Tag:
@@ -904,8 +880,7 @@ instance's greatest bottleneck.
continue
log.explain(f"Internalizing {url!r}")
img = await self._get_authenticated(url)
elem.attrs["src"] = "data:;base64," + \
base64.b64encode(img).decode()
elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
# For unknown reasons the protocol seems to be stripped.
elem.attrs["src"] = "https:" + elem.attrs["src"]
@@ -935,8 +910,7 @@ instance's greatest bottleneck.
soup = soupify(await request.read())
if IliasPage.is_logged_in(soup):
return self._verify_page(soup, url, root_page_allowed)
raise CrawlError(
f"get_page failed even after authenticating on {url!r}")
raise CrawlError(f"get_page failed even after authenticating on {url!r}")
@staticmethod
def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
@@ -1007,13 +981,11 @@ instance's greatest bottleneck.
login_form = login_page.find("form", attrs={"name": "formlogin"})
if login_form is None:
raise CrawlError(
"Could not find the login form! Specified client id might be invalid.")
raise CrawlError("Could not find the login form! Specified client id might be invalid.")
login_url = login_form.attrs.get("action")
if login_url is None:
raise CrawlError(
"Could not find the action URL in the login form!")
raise CrawlError("Could not find the action URL in the login form!")
username, password = await self._auth.credentials()
@@ -1034,8 +1006,7 @@ instance's greatest bottleneck.
# Normal ILIAS pages
mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
if mainbar is not None:
login_button = mainbar.find(
attrs={"href": lambda x: x and "login.php" in x})
login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
shib_login = soup.find(id="button_shib_login")
return not login_button and not shib_login

View file

@@ -48,8 +48,7 @@ class ShibbolethLogin:
while not self._login_successful(soup):
# Searching the form here so that this fails before asking for
# credentials rather than after asking.
form = soup.find(
"form", {"method": "post"})
form = soup.find("form", {"method": "post"})
action = form["action"]
# Equivalent: Enter credentials in