From e42ab83d32ce852eb26e1a21982399e2988e769a Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 25 Oct 2021 11:07:25 +0200 Subject: [PATCH] Add support for ILIAS cards --- PFERD/crawl/ilias/kit_ilias_html.py | 94 ++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 78ae084..d8c347d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -368,6 +368,8 @@ class IliasPage: log.explain(f"Found {element_name!r}") result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) + result += self._find_cards() + return result def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: @@ -450,6 +452,90 @@ class IliasPage: log.explain(f"Found file {full_path!r}") return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date) + def _find_cards(self) -> List[IliasPageElement]: + result: List[IliasPageElement] = [] + + card_titles: List[Tag] = self._soup.select(".card-title a") + + for title in card_titles: + url = self._abs_url_from_link(title) + name = _sanitize_path_name(title.getText().strip()) + type = self._find_type_from_card(title) + + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {title}") + continue + + result.append(IliasPageElement(type, url, name)) + + card_button_tiles: List[Tag] = self._soup.select(".card-title button") + + for button in card_button_tiles: + regex = re.compile(button["id"] + r".*window.open\(['\"](.+?)['\"]") + res = regex.search(str(self._soup)) + if not res: + _unexpected_html_warning() + log.warn_contd(f"Could not find click handler target for {button}") + continue + url = self._abs_url_from_relative(res.group(1)) + name = _sanitize_path_name(button.getText().strip()) + type = self._find_type_from_card(button) + caption_parent = button.findParent( + "div", + attrs={"class": lambda x: x and "caption" in x}, + ) + description = caption_parent.find_next_sibling("div").getText().strip() + + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {button}") + continue + + result.append(IliasPageElement(type, url, name, description=description)) + + return result + + def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]: + def is_card_root(element: Tag) -> bool: + return "il-card" in element["class"] and "thumbnail" in element["class"] + + card_root: Optional[Tag] = None + + # We look for the card root + for parent in card_title.parents: + if is_card_root(parent): + card_root = parent + break + + if card_root is None: + _unexpected_html_warning() + log.warn_contd(f"Tried to figure out element type, but did not find an icon for {card_title}") + return None + + icon: Tag = card_root.select_one(".il-card-repository-head .icon") + + if "opencast" in icon["class"]: + return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED + if "exc" in icon["class"]: + return IliasElementType.EXERCISE + if "webr" in icon["class"]: + return IliasElementType.LINK + if "book" in icon["class"]: + return IliasElementType.BOOKING + if "frm" in icon["class"]: + return IliasElementType.FORUM + if "sess" in icon["class"]: + return IliasElementType.MEETING + if "tst" in icon["class"]: + return IliasElementType.TEST + if "fold" in icon["class"]: + return IliasElementType.FOLDER + + _unexpected_html_warning() + log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") + return None + @staticmethod def _find_type_from_link( element_name: str, @@ -550,7 +636,13 @@ class IliasPage: """ Create an absolute url from an tag. """ - return urljoin(self._page_url, link_tag.get("href")) + return self._abs_url_from_relative(link_tag.get("href")) + + def _abs_url_from_relative(self, relative_url: str) -> str: + """ + Create an absolute url from a relative URL. + """ + return urljoin(self._page_url, relative_url) def _unexpected_html_warning() -> None: