mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 15:35:05 +02:00
Fix line wrapping
This commit is contained in:
parent
27667822eb
commit
998abc3afd
2 changed files with 30 additions and 60 deletions
|
|
@ -53,8 +53,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
|
||||||
self.missing_value("client_id")
|
self.missing_value("client_id")
|
||||||
return LoginTypeLocal(client_id)
|
return LoginTypeLocal(client_id)
|
||||||
|
|
||||||
self.invalid_value("login_type", login_type,
|
self.invalid_value("login_type", login_type, "Should be <shibboleth | local>")
|
||||||
"Should be <shibboleth | local>")
|
|
||||||
|
|
||||||
def tfa_auth(
|
def tfa_auth(
|
||||||
self, authenticators: Dict[str, Authenticator]
|
self, authenticators: Dict[str, Authenticator]
|
||||||
|
|
@ -64,8 +63,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
|
||||||
return None
|
return None
|
||||||
auth = authenticators.get(value)
|
auth = authenticators.get(value)
|
||||||
if auth is None:
|
if auth is None:
|
||||||
self.invalid_value("tfa_auth", value,
|
self.invalid_value("tfa_auth", value, "No such auth section exists")
|
||||||
"No such auth section exists")
|
|
||||||
return auth
|
return auth
|
||||||
|
|
||||||
def target(self) -> TargetType:
|
def target(self) -> TargetType:
|
||||||
|
|
@ -83,8 +81,7 @@ class IliasWebCrawlerSection(HttpCrawlerSection):
|
||||||
# URL
|
# URL
|
||||||
return target
|
return target
|
||||||
|
|
||||||
self.invalid_value(
|
self.invalid_value("target", target, "Should be <course id | desktop | kit ilias URL>")
|
||||||
"target", target, "Should be <course id | desktop | kit ilias URL>")
|
|
||||||
|
|
||||||
def links(self) -> Links:
|
def links(self) -> Links:
|
||||||
type_str: Optional[str] = self.s.get("links")
|
type_str: Optional[str] = self.s.get("links")
|
||||||
|
|
@ -200,8 +197,7 @@ instance's greatest bottleneck.
|
||||||
|
|
||||||
async def _run(self) -> None:
|
async def _run(self) -> None:
|
||||||
if isinstance(self._target, int):
|
if isinstance(self._target, int):
|
||||||
log.explain_topic(
|
log.explain_topic(f"Inferred crawl target: Course with id {self._target}")
|
||||||
f"Inferred crawl target: Course with id {self._target}")
|
|
||||||
await self._crawl_course(self._target)
|
await self._crawl_course(self._target)
|
||||||
elif self._target == "desktop":
|
elif self._target == "desktop":
|
||||||
log.explain_topic("Inferred crawl target: Personal desktop")
|
log.explain_topic("Inferred crawl target: Personal desktop")
|
||||||
|
|
@ -400,8 +396,7 @@ instance's greatest bottleneck.
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
log.explain("Answer: Yes")
|
log.explain("Answer: Yes")
|
||||||
element_path = element_path.with_name(
|
element_path = element_path.with_name(element_path.name + link_extension)
|
||||||
element_path.name + link_extension)
|
|
||||||
|
|
||||||
maybe_dl = await self.download(element_path, mtime=element.mtime)
|
maybe_dl = await self.download(element_path, mtime=element.mtime)
|
||||||
if not maybe_dl:
|
if not maybe_dl:
|
||||||
|
|
@ -413,11 +408,9 @@ instance's greatest bottleneck.
|
||||||
@_iorepeat(3, "resolving link")
|
@_iorepeat(3, "resolving link")
|
||||||
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
|
async def _download_link(self, element: IliasPageElement, link_template: str, dl: DownloadToken) -> None:
|
||||||
async with dl as (bar, sink):
|
async with dl as (bar, sink):
|
||||||
export_url = element.url.replace(
|
export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML")
|
||||||
"cmd=calldirectlink", "cmd=exportHTML")
|
|
||||||
real_url = await self._resolve_link_target(export_url)
|
real_url = await self._resolve_link_target(export_url)
|
||||||
self._write_link_content(
|
self._write_link_content(link_template, real_url, element.name, element.description, sink)
|
||||||
link_template, real_url, element.name, element.description, sink)
|
|
||||||
|
|
||||||
def _write_link_content(
|
def _write_link_content(
|
||||||
self,
|
self,
|
||||||
|
|
@ -431,8 +424,7 @@ instance's greatest bottleneck.
|
||||||
content = content.replace("{{link}}", url)
|
content = content.replace("{{link}}", url)
|
||||||
content = content.replace("{{name}}", name)
|
content = content.replace("{{name}}", name)
|
||||||
content = content.replace("{{description}}", str(description))
|
content = content.replace("{{description}}", str(description))
|
||||||
content = content.replace(
|
content = content.replace("{{redirect_delay}}", str(self._link_file_redirect_delay))
|
||||||
"{{redirect_delay}}", str(self._link_file_redirect_delay))
|
|
||||||
sink.file.write(content.encode("utf-8"))
|
sink.file.write(content.encode("utf-8"))
|
||||||
sink.done()
|
sink.done()
|
||||||
|
|
||||||
|
|
@ -451,8 +443,7 @@ instance's greatest bottleneck.
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
log.explain("Answer: Yes")
|
log.explain("Answer: Yes")
|
||||||
element_path = element_path.with_name(
|
element_path = element_path.with_name(element_path.name + link_extension)
|
||||||
element_path.name + link_extension)
|
|
||||||
|
|
||||||
maybe_dl = await self.download(element_path, mtime=element.mtime)
|
maybe_dl = await self.download(element_path, mtime=element.mtime)
|
||||||
if not maybe_dl:
|
if not maybe_dl:
|
||||||
|
|
@ -485,8 +476,7 @@ instance's greatest bottleneck.
|
||||||
dl: DownloadToken,
|
dl: DownloadToken,
|
||||||
) -> None:
|
) -> None:
|
||||||
async with dl as (bar, sink):
|
async with dl as (bar, sink):
|
||||||
self._write_link_content(
|
self._write_link_content(link_template, element.url, element.name, element.description, sink)
|
||||||
link_template, element.url, element.name, element.description, sink)
|
|
||||||
|
|
||||||
async def _resolve_link_target(self, export_url: str) -> str:
|
async def _resolve_link_target(self, export_url: str) -> str:
|
||||||
async def impl() -> Optional[str]:
|
async def impl() -> Optional[str]:
|
||||||
|
|
@ -510,8 +500,7 @@ instance's greatest bottleneck.
|
||||||
if target is not None:
|
if target is not None:
|
||||||
return target
|
return target
|
||||||
|
|
||||||
raise CrawlError(
|
raise CrawlError("resolve_link_target failed even after authenticating")
|
||||||
"resolve_link_target failed even after authenticating")
|
|
||||||
|
|
||||||
async def _handle_opencast_video(
|
async def _handle_opencast_video(
|
||||||
self,
|
self,
|
||||||
|
|
@ -522,8 +511,7 @@ instance's greatest bottleneck.
|
||||||
if self.prev_report:
|
if self.prev_report:
|
||||||
self.report.add_custom_value(
|
self.report.add_custom_value(
|
||||||
_get_video_cache_key(element),
|
_get_video_cache_key(element),
|
||||||
self.prev_report.get_custom_value(
|
self.prev_report.get_custom_value(_get_video_cache_key(element))
|
||||||
_get_video_cache_key(element))
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# A video might contain other videos, so let's "crawl" the video first
|
# A video might contain other videos, so let's "crawl" the video first
|
||||||
|
|
@ -544,8 +532,7 @@ instance's greatest bottleneck.
|
||||||
# Mark all existing videos as known to ensure they do not get deleted during cleanup.
|
# Mark all existing videos as known to ensure they do not get deleted during cleanup.
|
||||||
# We "downloaded" them, just without actually making a network request as we assumed
|
# We "downloaded" them, just without actually making a network request as we assumed
|
||||||
# they did not change.
|
# they did not change.
|
||||||
contained = self._previous_contained_opencast_videos(
|
contained = self._previous_contained_opencast_videos(element, maybe_dl.path)
|
||||||
element, maybe_dl.path)
|
|
||||||
if len(contained) > 1:
|
if len(contained) > 1:
|
||||||
# Only do this if we threw away the original dl token,
|
# Only do this if we threw away the original dl token,
|
||||||
# to not download single-stream videos twice
|
# to not download single-stream videos twice
|
||||||
|
|
@ -561,8 +548,7 @@ instance's greatest bottleneck.
|
||||||
) -> List[PurePath]:
|
) -> List[PurePath]:
|
||||||
if not self.prev_report:
|
if not self.prev_report:
|
||||||
return []
|
return []
|
||||||
custom_value = self.prev_report.get_custom_value(
|
custom_value = self.prev_report.get_custom_value(_get_video_cache_key(element))
|
||||||
_get_video_cache_key(element))
|
|
||||||
if not custom_value:
|
if not custom_value:
|
||||||
return []
|
return []
|
||||||
cached_value = cast(dict[str, Any], custom_value)
|
cached_value = cast(dict[str, Any], custom_value)
|
||||||
|
|
@ -584,11 +570,9 @@ instance's greatest bottleneck.
|
||||||
f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}"
|
f"The following contained videos are known: {','.join(map(fmt_path, contained_videos))}"
|
||||||
)
|
)
|
||||||
if all(self._output_dir.resolve(path).exists() for path in contained_videos):
|
if all(self._output_dir.resolve(path).exists() for path in contained_videos):
|
||||||
log.explain(
|
log.explain("Found all known videos locally, skipping enumeration request")
|
||||||
"Found all known videos locally, skipping enumeration request")
|
|
||||||
return True
|
return True
|
||||||
log.explain(
|
log.explain("Missing at least one video, continuing with requests!")
|
||||||
"Missing at least one video, continuing with requests!")
|
|
||||||
else:
|
else:
|
||||||
log.explain("No local cache present")
|
log.explain("No local cache present")
|
||||||
return False
|
return False
|
||||||
|
|
@ -599,8 +583,7 @@ instance's greatest bottleneck.
|
||||||
def add_to_report(paths: list[str]) -> None:
|
def add_to_report(paths: list[str]) -> None:
|
||||||
self.report.add_custom_value(
|
self.report.add_custom_value(
|
||||||
_get_video_cache_key(element),
|
_get_video_cache_key(element),
|
||||||
{"known_paths": paths, "own_path": str(
|
{"known_paths": paths, "own_path": str(self._transformer.transform(dl.path))}
|
||||||
self._transformer.transform(dl.path))}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
async with dl as (bar, sink):
|
async with dl as (bar, sink):
|
||||||
|
|
@ -628,8 +611,7 @@ instance's greatest bottleneck.
|
||||||
continue
|
continue
|
||||||
async with maybe_dl as (bar, sink):
|
async with maybe_dl as (bar, sink):
|
||||||
log.explain(f"Streaming video from real url {stream_element.url}")
|
log.explain(f"Streaming video from real url {stream_element.url}")
|
||||||
contained_video_paths.append(
|
contained_video_paths.append(str(self._transformer.transform(maybe_dl.path)))
|
||||||
str(self._transformer.transform(maybe_dl.path)))
|
|
||||||
await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
|
await self._stream_from_url(stream_element.url, sink, bar, is_video=True)
|
||||||
|
|
||||||
add_to_report(contained_video_paths)
|
add_to_report(contained_video_paths)
|
||||||
|
|
@ -749,8 +731,7 @@ instance's greatest bottleneck.
|
||||||
|
|
||||||
tasks: List[Awaitable[None]] = []
|
tasks: List[Awaitable[None]] = []
|
||||||
for elem in elements:
|
for elem in elements:
|
||||||
tasks.append(asyncio.create_task(
|
tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem)))
|
||||||
self._download_forum_thread(cl.path, elem)))
|
|
||||||
|
|
||||||
# And execute them
|
# And execute them
|
||||||
await self.gather(tasks)
|
await self.gather(tasks)
|
||||||
|
|
@ -811,11 +792,9 @@ instance's greatest bottleneck.
|
||||||
tasks: List[Awaitable[None]] = []
|
tasks: List[Awaitable[None]] = []
|
||||||
for index, elem in enumerate(elements):
|
for index, elem in enumerate(elements):
|
||||||
prev_url = elements[index - 1].title if index > 0 else None
|
prev_url = elements[index - 1].title if index > 0 else None
|
||||||
next_url = elements[index +
|
next_url = elements[index + 1].title if index < len(elements) - 1 else None
|
||||||
1].title if index < len(elements) - 1 else None
|
|
||||||
tasks.append(asyncio.create_task(
|
tasks.append(asyncio.create_task(
|
||||||
self._download_learning_module_page(
|
self._download_learning_module_page(cl.path, elem, prev_url, next_url)
|
||||||
cl.path, elem, prev_url, next_url)
|
|
||||||
))
|
))
|
||||||
|
|
||||||
# And execute them
|
# And execute them
|
||||||
|
|
@ -868,15 +847,13 @@ instance's greatest bottleneck.
|
||||||
return
|
return
|
||||||
|
|
||||||
if prev:
|
if prev:
|
||||||
prev_p = self._transformer.transform(
|
prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html"))
|
||||||
parent_path / (_sanitize_path_name(prev) + ".html"))
|
|
||||||
if prev_p:
|
if prev_p:
|
||||||
prev = os.path.relpath(prev_p, my_path.parent)
|
prev = os.path.relpath(prev_p, my_path.parent)
|
||||||
else:
|
else:
|
||||||
prev = None
|
prev = None
|
||||||
if next:
|
if next:
|
||||||
next_p = self._transformer.transform(
|
next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html"))
|
||||||
parent_path / (_sanitize_path_name(next) + ".html"))
|
|
||||||
if next_p:
|
if next_p:
|
||||||
next = os.path.relpath(next_p, my_path.parent)
|
next = os.path.relpath(next_p, my_path.parent)
|
||||||
else:
|
else:
|
||||||
|
|
@ -885,8 +862,7 @@ instance's greatest bottleneck.
|
||||||
async with maybe_dl as (bar, sink):
|
async with maybe_dl as (bar, sink):
|
||||||
content = element.content
|
content = element.content
|
||||||
content = await self.internalize_images(content)
|
content = await self.internalize_images(content)
|
||||||
sink.file.write(learning_module_template(
|
sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8"))
|
||||||
content, maybe_dl.path.name, prev, next).encode("utf-8"))
|
|
||||||
sink.done()
|
sink.done()
|
||||||
|
|
||||||
async def internalize_images(self, tag: Tag) -> Tag:
|
async def internalize_images(self, tag: Tag) -> Tag:
|
||||||
|
|
@ -904,8 +880,7 @@ instance's greatest bottleneck.
|
||||||
continue
|
continue
|
||||||
log.explain(f"Internalizing {url!r}")
|
log.explain(f"Internalizing {url!r}")
|
||||||
img = await self._get_authenticated(url)
|
img = await self._get_authenticated(url)
|
||||||
elem.attrs["src"] = "data:;base64," + \
|
elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode()
|
||||||
base64.b64encode(img).decode()
|
|
||||||
if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
|
if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"):
|
||||||
# For unknown reasons the protocol seems to be stripped.
|
# For unknown reasons the protocol seems to be stripped.
|
||||||
elem.attrs["src"] = "https:" + elem.attrs["src"]
|
elem.attrs["src"] = "https:" + elem.attrs["src"]
|
||||||
|
|
@ -935,8 +910,7 @@ instance's greatest bottleneck.
|
||||||
soup = soupify(await request.read())
|
soup = soupify(await request.read())
|
||||||
if IliasPage.is_logged_in(soup):
|
if IliasPage.is_logged_in(soup):
|
||||||
return self._verify_page(soup, url, root_page_allowed)
|
return self._verify_page(soup, url, root_page_allowed)
|
||||||
raise CrawlError(
|
raise CrawlError(f"get_page failed even after authenticating on {url!r}")
|
||||||
f"get_page failed even after authenticating on {url!r}")
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
|
def _verify_page(soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup:
|
||||||
|
|
@ -1007,13 +981,11 @@ instance's greatest bottleneck.
|
||||||
|
|
||||||
login_form = login_page.find("form", attrs={"name": "formlogin"})
|
login_form = login_page.find("form", attrs={"name": "formlogin"})
|
||||||
if login_form is None:
|
if login_form is None:
|
||||||
raise CrawlError(
|
raise CrawlError("Could not find the login form! Specified client id might be invalid.")
|
||||||
"Could not find the login form! Specified client id might be invalid.")
|
|
||||||
|
|
||||||
login_url = login_form.attrs.get("action")
|
login_url = login_form.attrs.get("action")
|
||||||
if login_url is None:
|
if login_url is None:
|
||||||
raise CrawlError(
|
raise CrawlError("Could not find the action URL in the login form!")
|
||||||
"Could not find the action URL in the login form!")
|
|
||||||
|
|
||||||
username, password = await self._auth.credentials()
|
username, password = await self._auth.credentials()
|
||||||
|
|
||||||
|
|
@ -1034,8 +1006,7 @@ instance's greatest bottleneck.
|
||||||
# Normal ILIAS pages
|
# Normal ILIAS pages
|
||||||
mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
|
mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar")
|
||||||
if mainbar is not None:
|
if mainbar is not None:
|
||||||
login_button = mainbar.find(
|
login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x})
|
||||||
attrs={"href": lambda x: x and "login.php" in x})
|
|
||||||
shib_login = soup.find(id="button_shib_login")
|
shib_login = soup.find(id="button_shib_login")
|
||||||
return not login_button and not shib_login
|
return not login_button and not shib_login
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -48,8 +48,7 @@ class ShibbolethLogin:
|
||||||
while not self._login_successful(soup):
|
while not self._login_successful(soup):
|
||||||
# Searching the form here so that this fails before asking for
|
# Searching the form here so that this fails before asking for
|
||||||
# credentials rather than after asking.
|
# credentials rather than after asking.
|
||||||
form = soup.find(
|
form = soup.find("form", {"method": "post"})
|
||||||
"form", {"method": "post"})
|
|
||||||
action = form["action"]
|
action = form["action"]
|
||||||
|
|
||||||
# Equivalent: Enter credentials in
|
# Equivalent: Enter credentials in
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue