diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index e9e332e..1385ba4 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,5 +1,8 @@ -link_template_plain = "{{link}}" -link_template_rich = """ +from enum import Enum +from typing import Optional + +_link_template_plain = "{{link}}" +_link_template_fancy = """ @@ -84,4 +87,35 @@ link_template_rich = """ -""" # noqa: E501 line too long +""".strip() # noqa: E501 line too long + +_link_template_internet_shortcut = """ +[InternetShortcut] +URL={{link}} +""".strip() + + +class Links(Enum): + IGNORE = "ignore" + PLAIN = "plain" + FANCY = "fancy" + INTERNET_SHORTCUT = "internet-shortcut" + + def template(self) -> Optional[str]: + if self == self.FANCY: + return _link_template_fancy + elif self == self.PLAIN: + return _link_template_plain + elif self == self.INTERNET_SHORTCUT: + return _link_template_internet_shortcut + elif self == self.IGNORE: + return None + raise ValueError("Missing switch case") + + @staticmethod + def from_string(string: str) -> "Links": + try: + return Links(string) + except ValueError: + raise ValueError("must be one of 'ignore', 'plain'," + " 'fancy', 'internet-shortcut'") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 0bd3202..283289e 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -14,7 +14,7 @@ from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import CrawlError, CrawlWarning, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import link_template_plain, link_template_rich +from .file_templates import Links from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement, deduplicate_element_names TargetType = Union[str, int] @@ -52,8 +52,16 @@ class
KitIliasWebCrawlerSection(HttpCrawlerSection): def link_file_redirect_delay(self) -> int: return self.s.getint("link_file_redirect_delay", fallback=-1) - def link_file_use_plaintext(self) -> bool: - return self.s.getboolean("link_file_plaintext", fallback=False) + def links(self) -> Links: + type_str: Optional[str] = self.s.get("links") + + if type_str is None: + return Links.FANCY + + try: + return Links.from_string(type_str) + except ValueError as e: + self.invalid_value("links", type_str, str(e).capitalize()) def videos(self) -> bool: return self.s.getboolean("videos", fallback=False) @@ -166,7 +174,7 @@ class KitIliasWebCrawler(HttpCrawler): self._target = section.target() self._link_file_redirect_delay = section.link_file_redirect_delay() - self._link_file_use_plaintext = section.link_file_use_plaintext() + self._links = section.links() self._videos = section.videos() async def _run(self) -> None: @@ -292,6 +300,17 @@ class KitIliasWebCrawler(HttpCrawler): raise CrawlWarning(f"Unknown element type: {element.type!r}") async def _download_link(self, element: IliasPageElement, element_path: PurePath) -> None: + log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") + log.explain(f"Links type is {self._links}") + + link_template_maybe = self._links.template() + if not link_template_maybe: + log.explain("Answer: No") + return + else: + log.explain("Answer: Yes") + link_template = link_template_maybe + maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: return @@ -303,7 +322,7 @@ class KitIliasWebCrawler(HttpCrawler): export_url = element.url.replace("cmd=calldirectlink", "cmd=exportHTML") real_url = await self._resolve_link_target(export_url) - content = link_template_plain if self._link_file_use_plaintext else link_template_rich + content = link_template content = content.replace("{{link}}", real_url) content = content.replace("{{name}}", element.name) content = content.replace("{{description}}", str(element.description))