Fix line wrapping

This commit is contained in:
PinieP 2024-11-02 21:54:57 +01:00 committed by I-Al-Istannen
parent 27667822eb
commit 998abc3afd
2 changed files with 30 additions and 60 deletions

View file

@ -53,8 +53,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
self.missing_value("client_id") self.missing_value("client_id")
return LoginTypeLocal(client_id) return LoginTypeLocal(client_id)
self.invalid_value("login_type", login_type, self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
"Should be <shibboleth | local>")
def tfa_auth( def tfa_auth(
self, authenticators: Dict[str, Authenticator] self, authenticators: Dict[str, Authenticator]
@ -64,8 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
return None return None
auth = authenticators.get(value) auth = authenticators.get(value)
if auth is None: if auth is None:
self.invalid_value("tfa_auth", value, self.invalid_value("tfa_auth", value, "No such auth section exists")
"No such auth section exists")
return auth return auth
def target(self) -> TargetType: def target(self) -> TargetType:
@ -83,8 +81,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
# URL # URL
return target return target
self.invalid_value( self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
"target", target, "Should be <course id | desktop | kit ilias URL>")
def links(self) -> Links: def links(self) -> Links:
type_str: Optional[str] = self.s.get("links") type_str: Optional[str] = self.s.get("links")
@ -200,8 +197,7 @@ instance's greatest bottleneck.
async def _run(self) -> None: async def _run(self) -> None:
if isinstance(self._target, int): if isinstance(self._target, int):
log.explain_topic( log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
f"Inferred crawl target: Course with id {self._target}")
await self._crawl_course(self._target) await self._crawl_course(self._target)
elif self._target == "desktop": elif self._target == "desktop":
log.explain_topic("Inferred crawl target: Personal desktop") log.explain_topic("Inferred crawl target: Personal desktop")
@ -400,8 +396,7 @@ instance's greatest bottleneck.
return None return None
else: else:
log.explain("Answer: Yes") log.explain("Answer: Yes")
element_path = element_path.with_name( element_path = element_path.with_name(element_path.name + link_extension)
element_path.name + link_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime) maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl: if not maybe_dl:
@ -413,11 +408,9 @@ instance's greatest bottleneck.
@_iorepeat(3, "resolving link") @_iorepeat(3, "resolving link")
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None: async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
async with dl as (bar, sink): async with dl as (bar, sink):
export_url = element.url.replace( export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
"cmd=calldirectlink", "cmd=exportHTML")
real_url = await self._resolve_link_target(export_url) real_url = await self._resolve_link_target(export_url)
self._write_link_content( self._write_link_content(link_template, real_url, element.name, element.description, sink)
link_template, real_url, element.name, element.description, sink)
def _write_link_content( def _write_link_content(
self, self,
@ -431,8 +424,7 @@ instance's greatest bottleneck.
content = content.replace("{{link}}", url) content = content.replace("{{link}}", url)
content = content.replace("{{name}}", name) content = content.replace("{{name}}", name)
content = content.replace("{{description}}", str(description)) content = content.replace("{{description}}", str(description))
content = content.replace( content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
"{{redirect_delay}}", str(self._link_file_redirect_delay))
sink.file.write(content.encode("utf-8")) sink.file.write(content.encode("utf-8"))
sink.done() sink.done()
@ -451,8 +443,7 @@ instance's greatest bottleneck.
return None return None
else: else:
log.explain("Answer: Yes") log.explain("Answer: Yes")
element_path = element_path.with_name( element_path = element_path.with_name(element_path.name + link_extension)
element_path.name + link_extension)
maybe_dl = await self.download(element_path, mtime=element.mtime) maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl: if not maybe_dl:
@ -485,8 +476,7 @@ instance's greatest bottleneck.
dl: DownloadToken, dl: DownloadToken,
) -> None: ) -> None:
async with dl as (bar, sink): async with dl as (bar, sink):
self._write_link_content( self._write_link_content(link_template, element.url, element.name, element.description, sink)
link_template, element.url, element.name, element.description, sink)
async def _resolve_link_target(self, export_url: str) -> str: async def _resolve_link_target(self, export_url: str) -> str:
async def impl() -> Optional[str]: async def impl() -> Optional[str]:
@ -510,8 +500,7 @@ instance's greatest bottleneck.
if target is not None: if target is not None:
return target return target
raise CrawlError( raise CrawlError("resolve_link_target failed even after authenticating")
"resolve_link_target failed even after authenticating")
async def _handle_opencast_video( async def _handle_opencast_video(
self, self,
@ -522,8 +511,7 @@ instance's greatest bottleneck.
if self.prev_report: if self.prev_report:
self.report.add_custom_value( self.report.add_custom_value(
_get_video_cache_key(element), _get_video_cache_key(element),
self.prev_report.get_custom_value( self.prev_report.get_custom_value(_get_video_cache_key(element))
_get_video_cache_key(element))
) )
# A video might contain other videos, so let's "crawl" the video first # A video might contain other videos, so let's "crawl" the video first
@ -544,8 +532,7 @@ instance's greatest bottleneck.
# Mark all existing videos as known to ensure they do not get deleted during cleanup. # Mark all existing videos as known to ensure they do not get deleted during cleanup.
# We "downloaded" them, just without actually making a network request as we assumed # We "downloaded" them, just without actually making a network request as we assumed
# they did not change. # they did not change.
contained = self._previous_contained_opencast_videos( contained = self._previous_contained_opencast_videos(element, maybe_dl.path)
element, maybe_dl.path)
if len(contained) > 1: if len(contained) > 1:
# Only do this if we threw away the original dl token, # Only do this if we threw away the original dl token,
# to not download single-stream videos twice # to not download single-stream videos twice
@ -561,8 +548,7 @@ instance's greatest bottleneck.
) -> List[PurePath]: ) -> List[PurePath]:
if not self.prev_report: if not self.prev_report:
return [] return []
custom_value = self.prev_report.get_custom_value( custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
_get_video_cache_key(element))
if not custom_value: if not custom_value:
return [] return []
cached_value = cast(dict[str, Any], custom_value) cached_value = cast(dict[str, Any], custom_value)
@ -584,11 +570,9 @@ instance's greatest bottleneck.
f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}" f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}"
) )
if all(self._output_dir.resolve(path).exists() for path in contained_videos): if all(self._output_dir.resolve(path).exists() for path in contained_videos):
log.explain( log.explain("Found all known videos locally, skipping enumeration request")
"Found all known videos locally, skipping enumeration request")
return True return True
log.explain( log.explain("Missing at least one video, continuing with requests!")
"Missing at least one video, continuing with requests!")
else: else:
log.explain("No local cache present") log.explain("No local cache present")
return False return False
@ -599,8 +583,7 @@ instance's greatest bottleneck.
def add_to_report(paths: list[str]) -> None: def add_to_report(paths: list[str]) -> None:
self.report.add_custom_value( self.report.add_custom_value(
_get_video_cache_key(element), _get_video_cache_key(element),
{"known_paths": paths, "own_path": str( {"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))}
self._transformer.transform(dl.path))}
) )
async with dl as (bar, sink): async with dl as (bar, sink):
@ -628,8 +611,7 @@ instance's greatest bottleneck.
continue continue
async with maybe_dl as (bar, sink): async with maybe_dl as (bar, sink):
log.explain(f"Streaming video from real url {stream_element.url}") log.explain(f"Streaming video from real url {stream_element.url}")
contained_video_paths.append( contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
str(self._transformer.transform(maybe_dl.path)))
await self._stream_from_url(stream_element.url, sink, bar, is_video=True) await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
add_to_report(contained_video_paths) add_to_report(contained_video_paths)
@ -749,8 +731,7 @@ instance's greatest bottleneck.
tasks: List[Awaitable[None]] = [] tasks: List[Awaitable[None]] = []
for elem in elements: for elem in elements:
tasks.append(asyncio.create_task( tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
self._download_forum_thread(cl.path, elem)))
# And execute them # And execute them
await self.gather(tasks) await self.gather(tasks)
@ -811,11 +792,9 @@ instance's greatest bottleneck.
tasks: List[Awaitable[None]] = [] tasks: List[Awaitable[None]] = []
for index, elem in enumerate(elements): for index, elem in enumerate(elements):
prev_url = elements[index - 1].title if index > 0 else None prev_url = elements[index - 1].title if index > 0 else None
next_url = elements[index + next_url = elements[index + 1].title if index < len(elements) - 1 else None
1].title if index < len(elements) - 1 else None
tasks.append(asyncio.create_task( tasks.append(asyncio.create_task(
self._download_learning_module_page( self._download_learning_module_page(cl.path, elem, prev_url, next_url)
cl.path, elem, prev_url, next_url)
)) ))
# And execute them # And execute them
@ -868,15 +847,13 @@ instance's greatest bottleneck.
return return
if prev: if prev:
prev_p = self._transformer.transform( prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
parent_path / (_sanitize_path_name(prev) + ".html"))
if prev_p: if prev_p:
prev = os.path.relpath(prev_p, my_path.parent) prev = os.path.relpath(prev_p, my_path.parent)
else: else:
prev = None prev = None
if next: if next:
next_p = self._transformer.transform( next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
parent_path / (_sanitize_path_name(next) + ".html"))
if next_p: if next_p:
next = os.path.relpath(next_p, my_path.parent) next = os.path.relpath(next_p, my_path.parent)
else: else:
@ -885,8 +862,7 @@ instance's greatest bottleneck.
async with maybe_dl as (bar, sink): async with maybe_dl as (bar, sink):
content = element.content content = element.content
content = await self.internalize_images(content) content = await self.internalize_images(content)
sink.file.write(learning_module_template( sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8"))
content, maybe_dl.path.name, prev, next).encode("utf-8"))
sink.done() sink.done()
async def internalize_images(self, tag: Tag) -> Tag: async def internalize_images(self, tag: Tag) -> Tag:
@ -904,8 +880,7 @@ instance's greatest bottleneck.
continue continue
log.explain(f"Internalizing {url!r}") log.explain(f"Internalizing {url!r}")
img = await self._get_authenticated(url) img = await self._get_authenticated(url)
elem.attrs["src"] = "data:;base64," + \ elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
base64.b64encode(img).decode()
if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
# For unknown reasons the protocol seems to be stripped. # For unknown reasons the protocol seems to be stripped.
elem.attrs["src"] = "https:" + elem.attrs["src"] elem.attrs["src"] = "https:" + elem.attrs["src"]
@ -935,8 +910,7 @@ instance's greatest bottleneck.
soup = soupify(await request.read()) soup = soupify(await request.read())
if IliasPage.is_logged_in(soup): if IliasPage.is_logged_in(soup):
return self._verify_page(soup, url, root_page_allowed) return self._verify_page(soup, url, root_page_allowed)
raise CrawlError( raise CrawlError(f"get_page failed even after authenticating on {url!r}")
f"get_page failed even after authenticating on {url!r}")
@staticmethod @staticmethod
def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
@ -1007,13 +981,11 @@ instance's greatest bottleneck.
login_form = login_page.find("form", attrs={"name": "formlogin"}) login_form = login_page.find("form", attrs={"name": "formlogin"})
if login_form is None: if login_form is None:
raise CrawlError( raise CrawlError("Could not find the login form! Specified client id might be invalid.")
"Could not find the login form! Specified client id might be invalid.")
login_url = login_form.attrs.get("action") login_url = login_form.attrs.get("action")
if login_url is None: if login_url is None:
raise CrawlError( raise CrawlError("Could not find the action URL in the login form!")
"Could not find the action URL in the login form!")
username, password = await self._auth.credentials() username, password = await self._auth.credentials()
@ -1034,8 +1006,7 @@ instance's greatest bottleneck.
# Normal ILIAS pages # Normal ILIAS pages
mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
if mainbar is not None: if mainbar is not None:
login_button = mainbar.find( login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
attrs={"href": lambda x: x and "login.php" in x})
shib_login = soup.find(id="button_shib_login") shib_login = soup.find(id="button_shib_login")
return not login_button and not shib_login return not login_button and not shib_login

View file

@ -48,8 +48,7 @@ class ShibbolethLogin:
while not self._login_successful(soup): while not self._login_successful(soup):
# Searching the form here so that this fails before asking for # Searching the form here so that this fails before asking for
# credentials rather than after asking. # credentials rather than after asking.
form = soup.find( form = soup.find("form", {"method": "post"})
"form", {"method": "post"})
action = form["action"] action = form["action"]
# Equivalent: Enter credentials in # Equivalent: Enter credentials in