mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 23:45:05 +02:00
Fix line wrapping
This commit is contained in:
parent
27667822eb
commit
998abc3afd
2 changed files with 30 additions and 60 deletions
|
|
@ -53,8 +53,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
|
|||
self.missing_value("client_id")
|
||||
return LoginTypeLocal(client_id)
|
||||
|
||||
self.invalid_value("login_type", login_type,
|
||||
"Should be <shibboleth | local>")
|
||||
self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
|
||||
|
||||
def tfa_auth(
|
||||
self, authenticators: Dict[str, Authenticator]
|
||||
|
|
@ -64,8 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
|
|||
return None
|
||||
auth = authenticators.get(value)
|
||||
if auth is None:
|
||||
self.invalid_value("tfa_auth", value,
|
||||
"No such auth section exists")
|
||||
self.invalid_value("tfa_auth", value, "No such auth section exists")
|
||||
return auth
|
||||
|
||||
def target(self) -> TargetType:
|
||||
|
|
@ -83,8 +81,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
|
|||
# URL
|
||||
return target
|
||||
|
||||
self.invalid_value(
|
||||
"target", target, "Should be <course id | desktop | kit ilias URL>")
|
||||
self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
|
||||
|
||||
def links(self) -> Links:
|
||||
type_str: Optional[str] = self.s.get("links")
|
||||
|
|
@ -200,8 +197,7 @@ instance's greatest bottleneck.
|
|||
|
||||
async def _run(self) -> None:
|
||||
if isinstance(self._target, int):
|
||||
log.explain_topic(
|
||||
f"Inferred crawl target: Course with id {self._target}")
|
||||
log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
|
||||
await self._crawl_course(self._target)
|
||||
elif self._target == "desktop":
|
||||
log.explain_topic("Inferred crawl target: Personal desktop")
|
||||
|
|
@ -400,8 +396,7 @@ instance's greatest bottleneck.
|
|||
return None
|
||||
else:
|
||||
log.explain("Answer: Yes")
|
||||
element_path = element_path.with_name(
|
||||
element_path.name + link_extension)
|
||||
element_path = element_path.with_name(element_path.name + link_extension)
|
||||
|
||||
maybe_dl = await self.download(element_path, mtime=element.mtime)
|
||||
if not maybe_dl:
|
||||
|
|
@ -413,11 +408,9 @@ instance's greatest bottleneck.
|
|||
@_iorepeat(3, "resolving link")
|
||||
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
|
||||
async with dl as (bar, sink):
|
||||
export_url = element.url.replace(
|
||||
"cmd=calldirectlink", "cmd=exportHTML")
|
||||
export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
|
||||
real_url = await self._resolve_link_target(export_url)
|
||||
self._write_link_content(
|
||||
link_template, real_url, element.name, element.description, sink)
|
||||
self._write_link_content(link_template, real_url, element.name, element.description, sink)
|
||||
|
||||
def _write_link_content(
|
||||
self,
|
||||
|
|
@ -431,8 +424,7 @@ instance's greatest bottleneck.
|
|||
content = content.replace("{{link}}", url)
|
||||
content = content.replace("{{name}}", name)
|
||||
content = content.replace("{{description}}", str(description))
|
||||
content = content.replace(
|
||||
"{{redirect_delay}}", str(self._link_file_redirect_delay))
|
||||
content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
|
||||
sink.file.write(content.encode("utf-8"))
|
||||
sink.done()
|
||||
|
||||
|
|
@ -451,8 +443,7 @@ instance's greatest bottleneck.
|
|||
return None
|
||||
else:
|
||||
log.explain("Answer: Yes")
|
||||
element_path = element_path.with_name(
|
||||
element_path.name + link_extension)
|
||||
element_path = element_path.with_name(element_path.name + link_extension)
|
||||
|
||||
maybe_dl = await self.download(element_path, mtime=element.mtime)
|
||||
if not maybe_dl:
|
||||
|
|
@ -485,8 +476,7 @@ instance's greatest bottleneck.
|
|||
dl: DownloadToken,
|
||||
) -> None:
|
||||
async with dl as (bar, sink):
|
||||
self._write_link_content(
|
||||
link_template, element.url, element.name, element.description, sink)
|
||||
self._write_link_content(link_template, element.url, element.name, element.description, sink)
|
||||
|
||||
async def _resolve_link_target(self, export_url: str) -> str:
|
||||
async def impl() -> Optional[str]:
|
||||
|
|
@ -510,8 +500,7 @@ instance's greatest bottleneck.
|
|||
if target is not None:
|
||||
return target
|
||||
|
||||
raise CrawlError(
|
||||
"resolve_link_target failed even after authenticating")
|
||||
raise CrawlError("resolve_link_target failed even after authenticating")
|
||||
|
||||
async def _handle_opencast_video(
|
||||
self,
|
||||
|
|
@ -522,8 +511,7 @@ instance's greatest bottleneck.
|
|||
if self.prev_report:
|
||||
self.report.add_custom_value(
|
||||
_get_video_cache_key(element),
|
||||
self.prev_report.get_custom_value(
|
||||
_get_video_cache_key(element))
|
||||
self.prev_report.get_custom_value(_get_video_cache_key(element))
|
||||
)
|
||||
|
||||
# A video might contain other videos, so let's "crawl" the video first
|
||||
|
|
@ -544,8 +532,7 @@ instance's greatest bottleneck.
|
|||
# Mark all existing videos as known to ensure they do not get deleted during cleanup.
|
||||
# We "downloaded" them, just without actually making a network request as we assumed
|
||||
# they did not change.
|
||||
contained = self._previous_contained_opencast_videos(
|
||||
element, maybe_dl.path)
|
||||
contained = self._previous_contained_opencast_videos(element, maybe_dl.path)
|
||||
if len(contained) > 1:
|
||||
# Only do this if we threw away the original dl token,
|
||||
# to not download single-stream videos twice
|
||||
|
|
@ -561,8 +548,7 @@ instance's greatest bottleneck.
|
|||
) -> List[PurePath]:
|
||||
if not self.prev_report:
|
||||
return []
|
||||
custom_value = self.prev_report.get_custom_value(
|
||||
_get_video_cache_key(element))
|
||||
custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
|
||||
if not custom_value:
|
||||
return []
|
||||
cached_value = cast(dict[str, Any], custom_value)
|
||||
|
|
@ -584,11 +570,9 @@ instance's greatest bottleneck.
|
|||
f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}"
|
||||
)
|
||||
if all(self._output_dir.resolve(path).exists() for path in contained_videos):
|
||||
log.explain(
|
||||
"Found all known videos locally, skipping enumeration request")
|
||||
log.explain("Found all known videos locally, skipping enumeration request")
|
||||
return True
|
||||
log.explain(
|
||||
"Missing at least one video, continuing with requests!")
|
||||
log.explain("Missing at least one video, continuing with requests!")
|
||||
else:
|
||||
log.explain("No local cache present")
|
||||
return False
|
||||
|
|
@ -599,8 +583,7 @@ instance's greatest bottleneck.
|
|||
def add_to_report(paths: list[str]) -> None:
|
||||
self.report.add_custom_value(
|
||||
_get_video_cache_key(element),
|
||||
{"known_paths": paths, "own_path": str(
|
||||
self._transformer.transform(dl.path))}
|
||||
{"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))}
|
||||
)
|
||||
|
||||
async with dl as (bar, sink):
|
||||
|
|
@ -628,8 +611,7 @@ instance's greatest bottleneck.
|
|||
continue
|
||||
async with maybe_dl as (bar, sink):
|
||||
log.explain(f"Streaming video from real url {stream_element.url}")
|
||||
contained_video_paths.append(
|
||||
str(self._transformer.transform(maybe_dl.path)))
|
||||
contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
|
||||
await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
|
||||
|
||||
add_to_report(contained_video_paths)
|
||||
|
|
@ -749,8 +731,7 @@ instance's greatest bottleneck.
|
|||
|
||||
tasks: List[Awaitable[None]] = []
|
||||
for elem in elements:
|
||||
tasks.append(asyncio.create_task(
|
||||
self._download_forum_thread(cl.path, elem)))
|
||||
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
|
||||
|
||||
# And execute them
|
||||
await self.gather(tasks)
|
||||
|
|
@ -811,11 +792,9 @@ instance's greatest bottleneck.
|
|||
tasks: List[Awaitable[None]] = []
|
||||
for index, elem in enumerate(elements):
|
||||
prev_url = elements[index - 1].title if index > 0 else None
|
||||
next_url = elements[index +
|
||||
1].title if index < len(elements) - 1 else None
|
||||
next_url = elements[index + 1].title if index < len(elements) - 1 else None
|
||||
tasks.append(asyncio.create_task(
|
||||
self._download_learning_module_page(
|
||||
cl.path, elem, prev_url, next_url)
|
||||
self._download_learning_module_page(cl.path, elem, prev_url, next_url)
|
||||
))
|
||||
|
||||
# And execute them
|
||||
|
|
@ -868,15 +847,13 @@ instance's greatest bottleneck.
|
|||
return
|
||||
|
||||
if prev:
|
||||
prev_p = self._transformer.transform(
|
||||
parent_path / (_sanitize_path_name(prev) + ".html"))
|
||||
prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
|
||||
if prev_p:
|
||||
prev = os.path.relpath(prev_p, my_path.parent)
|
||||
else:
|
||||
prev = None
|
||||
if next:
|
||||
next_p = self._transformer.transform(
|
||||
parent_path / (_sanitize_path_name(next) + ".html"))
|
||||
next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
|
||||
if next_p:
|
||||
next = os.path.relpath(next_p, my_path.parent)
|
||||
else:
|
||||
|
|
@ -885,8 +862,7 @@ instance's greatest bottleneck.
|
|||
async with maybe_dl as (bar, sink):
|
||||
content = element.content
|
||||
content = await self.internalize_images(content)
|
||||
sink.file.write(learning_module_template(
|
||||
content, maybe_dl.path.name, prev, next).encode("utf-8"))
|
||||
sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8"))
|
||||
sink.done()
|
||||
|
||||
async def internalize_images(self, tag: Tag) -> Tag:
|
||||
|
|
@ -904,8 +880,7 @@ instance's greatest bottleneck.
|
|||
continue
|
||||
log.explain(f"Internalizing {url!r}")
|
||||
img = await self._get_authenticated(url)
|
||||
elem.attrs["src"] = "data:;base64," + \
|
||||
base64.b64encode(img).decode()
|
||||
elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
|
||||
if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
|
||||
# For unknown reasons the protocol seems to be stripped.
|
||||
elem.attrs["src"] = "https:" + elem.attrs["src"]
|
||||
|
|
@ -935,8 +910,7 @@ instance's greatest bottleneck.
|
|||
soup = soupify(await request.read())
|
||||
if IliasPage.is_logged_in(soup):
|
||||
return self._verify_page(soup, url, root_page_allowed)
|
||||
raise CrawlError(
|
||||
f"get_page failed even after authenticating on {url!r}")
|
||||
raise CrawlError(f"get_page failed even after authenticating on {url!r}")
|
||||
|
||||
@staticmethod
|
||||
def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
|
||||
|
|
@ -1007,13 +981,11 @@ instance's greatest bottleneck.
|
|||
|
||||
login_form = login_page.find("form", attrs={"name": "formlogin"})
|
||||
if login_form is None:
|
||||
raise CrawlError(
|
||||
"Could not find the login form! Specified client id might be invalid.")
|
||||
raise CrawlError("Could not find the login form! Specified client id might be invalid.")
|
||||
|
||||
login_url = login_form.attrs.get("action")
|
||||
if login_url is None:
|
||||
raise CrawlError(
|
||||
"Could not find the action URL in the login form!")
|
||||
raise CrawlError("Could not find the action URL in the login form!")
|
||||
|
||||
username, password = await self._auth.credentials()
|
||||
|
||||
|
|
@ -1034,8 +1006,7 @@ instance's greatest bottleneck.
|
|||
# Normal ILIAS pages
|
||||
mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
|
||||
if mainbar is not None:
|
||||
login_button = mainbar.find(
|
||||
attrs={"href": lambda x: x and "login.php" in x})
|
||||
login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
|
||||
shib_login = soup.find(id="button_shib_login")
|
||||
return not login_button and not shib_login
|
||||
|
||||
|
|
|
|||
|
|
@ -48,8 +48,7 @@ class ShibbolethLogin:
|
|||
while not self._login_successful(soup):
|
||||
# Searching the form here so that this fails before asking for
|
||||
# credentials rather than after asking.
|
||||
form = soup.find(
|
||||
"form", {"method": "post"})
|
||||
form = soup.find("form", {"method": "post"})
|
||||
action = form["action"]
|
||||
|
||||
# Equivalent: Enter credentials in
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue