diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index ece88c5..9c8ab95 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -39,7 +39,12 @@ class IliasPageElement: description: Optional[str] = None def id(self) -> str: - regexes = [r"eid=(?P<id>[0-9a-z\-]+)", r"file_(?P<id>\d+)", r"ref_id=(?P<id>\d+)"] + regexes = [ + r"eid=(?P<id>[0-9a-z\-]+)", + r"file_(?P<id>\d+)", + r"ref_id=(?P<id>\d+)", + r"target=[a-z]+_(?P<id>\d+)" + ] for regex in regexes: if match := re.search(regex, self.url): @@ -71,6 +76,9 @@ class IliasPage: if self._is_exercise_file(): log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() + if self._is_personal_desktop(): + log.explain("Page is the personal desktop") + return self._find_personal_desktop_entries() log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() @@ -115,6 +123,9 @@ class IliasPage: return False + def _is_personal_desktop(self) -> bool: + return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. 
The actual video stream url is nowhere @@ -149,6 +160,26 @@ class IliasPage: return items + def _find_personal_desktop_entries(self) -> List[IliasPageElement]: + items: List[IliasPageElement] = [] + + titles: List[Tag] = self._soup.select(".il-item-title") + for title in titles: + link = title.find("a") + name = _sanitize_path_name(link.text.strip()) + url = self._abs_url_from_link(link) + + type = self._find_type_from_link(name, link, url) + if not type: + _unexpected_html_warning() + log.warn_contd(f"Could not extract type for {link}") + continue + + log.explain(f"Found {name!r}") + items.append(IliasPageElement(type, url, name)) + + return items + def _find_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. The initial dummy page without any videos. This page contains the link to the listing @@ -551,9 +582,30 @@ class IliasPage: if "target=file_" in parsed_url.query: return IliasElementType.FILE + if "target=grp_" in parsed_url.query: + return IliasElementType.FOLDER + + if "target=crs_" in parsed_url.query: + return IliasElementType.FOLDER + + if "baseClass=ilExerciseHandlerGUI" in parsed_url.query: + return IliasElementType.EXERCISE + + if "baseClass=ilLinkResourceHandlerGUI" in parsed_url.query and "calldirectlink" in parsed_url.query: + return IliasElementType.LINK + + if "cmd=showThreads" in parsed_url.query or "target=frm_" in parsed_url.query: + return IliasElementType.FORUM + + if "cmdClass=ilobjtestgui" in parsed_url.query: + return IliasElementType.TEST + + # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so + # try to guess it from the image. 
+ # Everything with a ref_id can *probably* be opened to reveal nested things # video groups, directories, exercises, etc - if "ref_id=" in parsed_url.query: + if "ref_id=" in parsed_url.query or "goto.php" in parsed_url.path: return IliasPage._find_type_from_folder_like(link_element, url) _unexpected_html_warning() @@ -574,7 +626,7 @@ class IliasPage: # We look for the outer div of our inner link, to find information around it # (mostly the icon) for parent in link_element.parents: - if "ilContainerListItemOuter" in parent["class"]: + if "ilContainerListItemOuter" in parent["class"] or "il-std-item" in parent["class"]: found_parent = parent break @@ -586,6 +638,9 @@ class IliasPage: # Find the small descriptive icon to figure out the type img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon") + if img_tag is None: + img_tag = found_parent.select_one("img.icon") + if img_tag is None: _unexpected_html_warning() log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 5d44566..99d6cf6 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -203,7 +203,9 @@ instance's greatest bottleneck. await self._crawl_url(root_url, expected_id=course_id) async def _crawl_desktop(self) -> None: - await self._crawl_url(self._base_url) + appendix = r"ILIAS\PersonalDesktop\PDMainBarProvider|mm_pd_sel_items" + appendix = appendix.encode("ASCII").hex() + await self._crawl_url(self._base_url + "/gs_content.php?item=" + appendix) async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: maybe_cl = await self.crawl(PurePath(".")) @@ -622,6 +624,11 @@ instance's greatest bottleneck. 
if mainbar is not None: login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) return not login_button + + # Personal Desktop + if soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}): + return True + # Video listing embeds do not have complete ILIAS html. Try to match them by # their video listing table video_table = soup.find(