From b8fe25c580a8cafc14c32890f0635c7daecafc4d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 4 May 2022 14:13:39 +0200 Subject: [PATCH 001/147] Add `.cpp` to ipd link regex --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 310059a..22fdd29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Changed +- Add `.cpp` to IPD link regex + ## 3.4.0 - 2022-05-01 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 1a5314b..e5ec58f 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -27,7 +27,7 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target def link_regex(self) -> Pattern[str]: - regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|java)$") + regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$") return re.compile(regex) From afbd03f7774a1c0f22c471d98f995153bb08edcd Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:15:48 +0200 Subject: [PATCH 002/147] Fix docs --- CHANGELOG.md | 2 +- CONFIG.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22fdd29..f5af29d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ ambiguous situations. ## Unreleased ### Changed -- Add `.cpp` to IPD link regex +- Add `cpp` extension to default `link_regex` of IPD crawler ## 3.4.0 - 2022-05-01 diff --git a/CONFIG.md b/CONFIG.md index 569780d..1355c34 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -146,7 +146,7 @@ requests is likely a good idea. - `target`: URL to a KIT-IPD page - `link_regex`: A regex that is matched against the `href` part of links. If it matches, the given link is downloaded as a file. This is used to extract - files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|java)$`) + files from KIT-IPD pages. 
(Default: `^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$`) ### The `kit-ilias-web` crawler From bc3fa36637b5a4f4ea26db1a9437e4cbd5cad5c4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:20:45 +0200 Subject: [PATCH 003/147] Fix IPD crawler crashing on weird HTML comments --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5af29d..de7b795 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,9 @@ ambiguous situations. ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler +### Fixed +- IPD crawler crashes on some sites + ## 3.4.0 - 2022-05-01 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index e5ec58f..58e71f8 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -161,4 +161,10 @@ class KitIpdCrawler(HttpCrawler): async def get_page(self) -> BeautifulSoup: async with self.session.get(self._url) as request: - return soupify(await request.read()) + # The web page for Algorithmen für Routenplanung contains some + # weird comments that beautifulsoup doesn't parse correctly. This + # hack enables those pages to be crawled, and should hopefully not + # cause issues on other pages. + content = (await request.read()).decode("utf-8") + content = re.sub(r"", "", content) + return soupify(content.encode("utf-8")) From af2cc1169ace7154349518f7f709023eeb76ba95 Mon Sep 17 00:00:00 2001 From: Joscha Date: Thu, 5 May 2022 14:23:19 +0200 Subject: [PATCH 004/147] Mention href for users of link_regex option --- CHANGELOG.md | 1 + PFERD/crawl/kit_ipd_crawler.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de7b795..959fda0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. 
### Changed - Add `cpp` extension to default `link_regex` of IPD crawler +- Mention hrefs in IPD crawler for users of `link_regex` option ### Fixed - IPD crawler crashes on some sites diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 58e71f8..78fe0b1 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -45,7 +45,7 @@ class KitIpdFolder: def explain(self) -> None: log.explain_topic(f"Folder {self.name!r}") for file in self.files: - log.explain(f"File {file.name!r}") + log.explain(f"File {file.name!r} (href={file.url!r})") def __hash__(self) -> int: return self.name.__hash__() @@ -113,7 +113,7 @@ class KitIpdCrawler(HttpCrawler): else: file = self._extract_file(element) items.add(file) - log.explain_topic(f"Orphan file {file.name!r}") + log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") log.explain("Attributing it to root folder") return items From 694ffb4d7711265d768a636cf1843e302485c62d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:28:30 +0200 Subject: [PATCH 005/147] Fix meeting date parsing Apparently the new pattern ": ," was added. This patch adds support for it. 
--- PFERD/crawl/ilias/kit_ilias_html.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 94b2e4b..dfe111d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -763,9 +763,14 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti """ try: date_str = re.sub(r"\s+", " ", date_str) + date_str = re.sub("(Gestern|Yesterday):", "", date_str, re.I) + date_str = re.sub("(Heute|Today):", "", date_str, re.I) + date_str = re.sub("(Morgen|Tomorrow):", "", date_str, re.I) + date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, re.I) + date_str = date_str.strip() for german, english in zip(german_months, english_months): date_str = date_str.replace(german, english) # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" From bcc537468c46088f78a037fb28364866e8653bb5 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:53:37 +0200 Subject: [PATCH 006/147] Fix crawling of expanded meetings The last meeting on every page is expanded by default. Its content is then shown inline *and* in the meeting page itself. We should skip the inline content. --- PFERD/crawl/ilias/kit_ilias_html.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index dfe111d..d93684c 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -428,6 +428,12 @@ class IliasPage: element_type = self._find_type_from_link(element_name, link, abs_url) description = self._find_link_description(link) + # The last meeting on every page is expanded by default. 
+ # Its content is then shown inline *and* in the meeting page itself. + # We should skip the inline content. + if element_type != IliasElementType.MEETING and self._is_in_expanded_meeting(link): + continue + if not element_type: continue if element_type == IliasElementType.MEETING: @@ -445,6 +451,26 @@ class IliasPage: return result + def _is_in_expanded_meeting(self, tag: Tag) -> bool: + """ + Returns whether a file is part of an expanded meeting. + Has false positives for meetings themselves as their title is also "in the expanded meeting content". + It is in the same general div and this whole thing is guesswork. + Therefore, you should check for meetings before passing them in this function. + """ + parents: List[Tag] = list(tag.parents) + for parent in parents: + if not parent.get("class"): + continue + + # We should not crawl files under meetings + if "ilContainerListItemContentCB" in parent.get("class"): + link: Tag = parent.parent.find("a") + type = IliasPage._find_type_from_folder_like(link, self._page_url) + return type == IliasElementType.MEETING + + return False + def _find_upwards_folder_hierarchy(self, tag: Tag) -> List[str]: """ Interprets accordions and expandable blocks as virtual folders and returns them From 2f0e04ce13ebbc7c7ccaa93e03d8f707f246ceef Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Thu, 5 May 2022 22:57:55 +0200 Subject: [PATCH 007/147] Adjust changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 959fda0..4249287 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ ambiguous situations. 
### Fixed - IPD crawler crashes on some sites +- Meeting name normalization for yesterday, today and tomorrow fails +- Crawling of meeting file previews ## 3.4.0 - 2022-05-01 From 616b0480f7c92afe11c36d2c105c99ba5f960e96 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 8 May 2022 17:39:18 +0200 Subject: [PATCH 008/147] Simplify IPD crawler link regex --- CHANGELOG.md | 5 +++-- CONFIG.md | 2 +- PFERD/crawl/kit_ipd_crawler.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4249287..e2d3840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,11 +24,12 @@ ambiguous situations. ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler -- Mention hrefs in IPD crawler for users of `link_regex` option +- Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option +- Simplify default IPD crawler `link_regex` ### Fixed - IPD crawler crashes on some sites -- Meeting name normalization for yesterday, today and tomorrow fails +- Meeting name normalization for yesterday, today and tomorrow - Crawling of meeting file previews ## 3.4.0 - 2022-05-01 diff --git a/CONFIG.md b/CONFIG.md index 1355c34..f572a80 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -146,7 +146,7 @@ requests is likely a good idea. - `target`: URL to a KIT-IPD page - `link_regex`: A regex that is matched against the `href` part of links. If it matches, the given link is downloaded as a file. This is used to extract - files from KIT-IPD pages. (Default: `^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$`) + files from KIT-IPD pages. 
(Default: `^.*?[^/]+\.(pdf|zip|c|cpp|java)$`) ### The `kit-ilias-web` crawler diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 78fe0b1..d9fac32 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -27,7 +27,7 @@ class KitIpdCrawlerSection(HttpCrawlerSection): return target def link_regex(self) -> Pattern[str]: - regex = self.s.get("link_regex", r"^.*/[^/]*\.(?:pdf|zip|c|cpp|java)$") + regex = self.s.get("link_regex", r"^.*?[^/]+\.(pdf|zip|c|cpp|java)$") return re.compile(regex) From a5015fe9b16d484613a27687f2c122b15e109ba2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 8 May 2022 23:21:18 +0200 Subject: [PATCH 009/147] Correctly parse day-only meeting dates I failed to recognize the correct format in the previous adjustment, so this (hopefully) fixes it for good. Meetings apparently don't always have a time portion. --- PFERD/crawl/ilias/kit_ilias_html.py | 48 +++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d93684c..6d063b6 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -746,17 +746,26 @@ class IliasPage: Normalizes meeting names, which have a relative time as their first part, to their date in ISO format. """ - date_portion_str = meeting_name.split(" - ")[0] + + # This checks whether we can reach a `:` without passing a `-` + if re.search(r"^[^-]+: ", meeting_name): + # Meeting name only contains date: "05. Jan 2000:" + split_delimiter = ":" + else: + # Meeting name contains date and start/end times: "05. 
Jan 2000, 16:00 - 17:30:" + split_delimiter = ", " + + # We have a meeting day without time + date_portion_str = meeting_name.split(split_delimiter)[0] date_portion = demangle_date(date_portion_str) + # We failed to parse the date, bail out if not date_portion: return meeting_name - rest_of_name = meeting_name - if rest_of_name.startswith(date_portion_str): - rest_of_name = rest_of_name[len(date_portion_str):] - - return datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") + rest_of_name + # Replace the first section with the absolute date + rest_of_name = split_delimiter.join(meeting_name.split(split_delimiter)[1:]) + return datetime.strftime(date_portion, "%Y-%m-%d") + split_delimiter + rest_of_name def _abs_url_from_link(self, link_tag: Tag) -> str: """ @@ -781,17 +790,15 @@ english_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]: """ - Demangle a given date in one of the following formats: + Demangle a given date in one of the following formats (hour/minute part is optional): "Gestern, HH:MM" "Heute, HH:MM" "Morgen, HH:MM" "dd. mon yyyy, HH:MM """ try: + # Normalize whitespace because users date_str = re.sub(r"\s+", " ", date_str) - date_str = re.sub("(Gestern|Yesterday):", "", date_str, re.I) - date_str = re.sub("(Heute|Today):", "", date_str, re.I) - date_str = re.sub("(Morgen|Tomorrow):", "", date_str, re.I) date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, re.I) date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, re.I) @@ -802,19 +809,28 @@ def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[dateti # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020" date_str = date_str.replace(english + ".", english) - # We now have a nice english String in the format: "dd. 
mmm yyyy, hh:mm" - day_part, time_part = date_str.split(",") + # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" or "dd. mmm yyyy" + + # Check if we have a time as well + if ", " in date_str: + day_part, time_part = date_str.split(",") + else: + day_part = date_str.split(",")[0] + time_part = None + day_str, month_str, year_str = day_part.split(" ") day = int(day_str.strip().replace(".", "")) month = english_months.index(month_str.strip()) + 1 year = int(year_str.strip()) - hour_str, minute_str = time_part.split(":") - hour = int(hour_str) - minute = int(minute_str) + if time_part: + hour_str, minute_str = time_part.split(":") + hour = int(hour_str) + minute = int(minute_str) + return datetime(year, month, day, hour, minute) - return datetime(year, month, day, hour, minute) + return datetime(year, month, day) except Exception: if not fail_silently: log.warn(f"Date parsing failed for {date_str!r}") From 846c29aee1867f7f0b7efae802af47fee77a3ec6 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 11 May 2022 21:16:09 +0200 Subject: [PATCH 010/147] Download page descriptions --- CHANGELOG.md | 3 + PFERD/crawl/ilias/ilias_html_cleaner.py | 91 ++++++++++++++++++++++ PFERD/crawl/ilias/kit_ilias_html.py | 25 ++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 29 +++++++ 4 files changed, 148 insertions(+) create mode 100644 PFERD/crawl/ilias/ilias_html_cleaner.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e2d3840..b7cad13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Added +- Download of page descriptions + ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler - Mention hrefs in IPD crawler's `--explain` output for users of `link_regex` option diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py new file mode 100644 index 0000000..5952309 --- /dev/null +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -0,0 +1,91 @@ +from bs4 import BeautifulSoup, Comment, Tag + +_STYLE_TAG_CONTENT = """ + .ilc_text_block_Information { + background-color: #f5f7fa; + } + div.ilc_text_block_Standard { + margin-bottom: 10px; + margin-top: 10px; + } + span.ilc_text_inline_Strong { + font-weight: bold; + } + + .accordion-head { + background-color: #f5f7fa; + padding: 0.5rem 0; + } + + h3 { + margin-top: 0.5rem; + margin-bottom: 1rem; + } + + br.visible-break { + margin-bottom: 1rem; + } + + article { + margin: 0.5rem 0; + } + + body { + padding: 1em; + grid-template-columns: 1fr min(60rem, 90%) 1fr; + line-height: 1.2; + } +""" + +_ARTICLE_WORTHY_CLASSES = [ + "ilc_text_block_Information", + "ilc_section_Attention", + "ilc_section_Link", +] + + +def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup: + head = soup.new_tag("head") + soup.insert(0, head) + + simplecss_link: Tag = soup.new_tag("link") + # + simplecss_link["rel"] = "stylesheet" + simplecss_link["href"] = "https://cdn.simplecss.org/simple.css" + head.append(simplecss_link) + + # Basic style tags for compat + style: Tag = soup.new_tag("style") + style.append(_STYLE_TAG_CONTENT) + head.append(style) + + return soup + + +def clean(soup: BeautifulSoup) -> BeautifulSoup: + for block in soup.find_all(class_=lambda x: x in _ARTICLE_WORTHY_CLASSES): + block.name = "article" + + for block in soup.find_all("h3"): + block.name = "div" + + for block in soup.find_all("h1"): + block.name = "h3" + + for block in soup.find_all(class_="ilc_va_ihcap_VAccordIHeadCap"): + block.name = "h3" + block["class"] += 
["accordion-head"] + + for dummy in soup.select(".ilc_text_block_Standard.ilc_Paragraph"): + children = list(dummy.children) + if not children: + dummy.decompose() + if len(children) > 1: + continue + if type(children[0]) == Comment: + dummy.decompose() + + for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): + hrule_imposter.insert(0, soup.new_tag("hr")) + + return soup diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 6d063b6..d58e5c8 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -85,6 +85,31 @@ class IliasPage: log.explain("Page is a normal folder, searching for elements") return self._find_normal_entries() + def get_description(self) -> Optional[BeautifulSoup]: + def is_interesting_class(name: str) -> bool: + return name in ["ilCOPageSection", "ilc_Paragraph", "ilc_va_ihcap_VAccordIHeadCap"] + + paragraphs: List[Tag] = self._soup.findAll(class_=is_interesting_class) + if not paragraphs: + return None + + # Extract bits and pieces into a string and parse it again. + # This ensures we don't miss anything and weird structures are resolved + # somewhat gracefully. 
+ raw_html = "" + for p in paragraphs: + if p.find_parent(class_=is_interesting_class): + continue + + # Ignore special listings (like folder groupings) + if "ilc_section_Special" in p["class"]: + continue + + raw_html += str(p) + "\n" + raw_html = f"\n{raw_html}\n" + + return BeautifulSoup(raw_html, "html.parser") + def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_ilias_opencast_embedding(): return self.get_child_elements()[0] diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ae9ebd4..bbed986 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -17,6 +17,7 @@ from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links +from .ilias_html_cleaner import clean, insert_base_markup from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement TargetType = Union[str, int] @@ -215,6 +216,8 @@ instance's greatest bottleneck. cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 elements: List[IliasPageElement] = [] + # A list as variable redefinitions are not propagated to outer scopes + description: List[BeautifulSoup] = [] @_iorepeat(3, "crawling url") async def gather_elements() -> None: @@ -233,9 +236,15 @@ instance's greatest bottleneck. page = IliasPage(soup, url, None) elements.extend(page.get_child_elements()) + if description_string := page.get_description(): + description.append(description_string) + # Fill up our task list with the found elements await gather_elements() + if description: + await self._download_description(PurePath("."), description[0]) + elements.sort(key=lambda e: e.id()) tasks: List[Awaitable[None]] = [] @@ -265,6 +274,8 @@ instance's greatest bottleneck. 
cl: CrawlToken, ) -> None: elements: List[IliasPageElement] = [] + # A list as variable redefinitions are not propagated to outer scopes + description: List[BeautifulSoup] = [] @_iorepeat(3, "crawling folder") async def gather_elements() -> None: @@ -285,10 +296,15 @@ instance's greatest bottleneck. next_stage_url = None elements.extend(page.get_child_elements()) + if description_string := page.get_description(): + description.append(description_string) # Fill up our task list with the found elements await gather_elements() + if description: + await self._download_description(PurePath("."), description[0]) + elements.sort(key=lambda e: e.id()) tasks: List[Awaitable[None]] = [] @@ -425,6 +441,19 @@ instance's greatest bottleneck. return self._download_booking(element, link_template_maybe, maybe_dl) + @anoncritical + @_iorepeat(1, "downloading description") + async def _download_description(self, parent_path: PurePath, description: BeautifulSoup) -> None: + path = parent_path / "Description.html" + dl = await self.download(path, redownload=Redownload.ALWAYS) + if not dl: + return + + async with dl as (bar, sink): + description = clean(insert_base_markup(description)) + sink.file.write(description.prettify().encode("utf-8")) + sink.done() + @anoncritical @_iorepeat(3, "resolving booking") async def _download_booking( From 46fb782798725b6fde76b71cf7a4d90912ea2c7d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 24 May 2022 23:28:09 +0200 Subject: [PATCH 011/147] Add forum crawling This downloads all forum posts when needed and saves each thread in its own html file, named after the thread title. 
--- CHANGELOG.md | 1 + PFERD/cli/command_kit_ilias_web.py | 7 ++ PFERD/crawl/ilias/kit_ilias_html.py | 90 ++++++++++++++- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 122 ++++++++++++++++++--- PFERD/logging.py | 4 +- 5 files changed, 208 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7cad13..1d70c4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Added - Download of page descriptions +- Forum download support ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler diff --git a/PFERD/cli/command_kit_ilias_web.py b/PFERD/cli/command_kit_ilias_web.py index 12803a6..de74fc3 100644 --- a/PFERD/cli/command_kit_ilias_web.py +++ b/PFERD/cli/command_kit_ilias_web.py @@ -62,6 +62,11 @@ GROUP.add_argument( action=BooleanOptionalAction, help="crawl and download videos" ) +GROUP.add_argument( + "--forums", + action=BooleanOptionalAction, + help="crawl and download forum posts" +) GROUP.add_argument( "--http-timeout", "-t", type=float, @@ -90,6 +95,8 @@ def load( section["link_redirect_delay"] = str(args.link_redirect_delay) if args.videos is not None: section["videos"] = "yes" if args.videos else "no" + if args.forums is not None: + section["forums"] = "yes" if args.forums else "no" if args.http_timeout is not None: section["http_timeout"] = str(args.http_timeout) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d58e5c8..7bab152 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -55,6 +55,20 @@ class IliasPageElement: return self.url +@dataclass +class IliasDownloadForumData: + url: str + form_data: Dict[str, 
Union[str, List[str]]] + + +@dataclass +class IliasForumThread: + title: str + title_tag: Tag + content_tag: Tag + mtime: Optional[datetime] + + class IliasPage: def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): @@ -110,13 +124,39 @@ class IliasPage: return BeautifulSoup(raw_html, "html.parser") + def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: + form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) + if not form: + return None + post_url = self._abs_url_from_relative(form["action"]) + + form_data: Dict[str, Union[str, List[ſtr]]] = { + "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})], + "selected_cmd2": "html", + "select_cmd2": "Ausführen", + "selected_cmd": "", + } + + return IliasDownloadForumData(post_url, form_data) + def get_next_stage_element(self) -> Optional[IliasPageElement]: + if self._is_forum_page(): + if "trows=800" in self._page_url: + return None + return self._get_show_max_forum_entries_per_page_url() if self._is_ilias_opencast_embedding(): return self.get_child_elements()[0] if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: return self._find_video_entries_paginated()[0] return None + def _is_forum_page(self) -> bool: + read_more_btn = self._soup.find( + "button", + attrs={"onclick": lambda x: x and "cmdClass=ilobjforumgui&cmd=markAllRead" in x} + ) + return read_more_btn is not None + def _is_video_player(self) -> bool: return "paella_config_file" in str(self._soup) @@ -194,6 +234,19 @@ class IliasPage: return items + def _get_show_max_forum_entries_per_page_url(self) -> Optional[IliasPageElement]: + correct_link = self._soup.find( + "a", + attrs={"href": lambda x: x and "trows=800" in x and "cmd=showThreads" in x} + ) + + if not correct_link: + return None + + link = self._abs_url_from_link(correct_link) + + return IliasPageElement(IliasElementType.FORUM, link, "show all forum 
threads") + def _find_personal_desktop_entries(self) -> List[IliasPageElement]: items: List[IliasPageElement] = [] @@ -877,3 +930,38 @@ def _tomorrow() -> date: def _sanitize_path_name(name: str) -> str: return name.replace("/", "-").replace("\\", "-").strip() + + +def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThread]: + elements = [] + for p in forum_export.select("body > p"): + title_tag = p + content_tag = p.find_next_sibling("ul") + title = p.find("b").text + if ":" in title: + title = title[title.find(":") + 1:] + title = title.strip() + mtime = _guess_timestamp_from_forum_post_content(content_tag) + elements.append(IliasForumThread(title, title_tag, content_tag, mtime)) + + return elements + + +def _guess_timestamp_from_forum_post_content(content: Tag) -> Optional[datetime]: + posts: Optional[Tag] = content.select(".ilFrmPostHeader > span.small") + if not posts: + return None + + newest_date: Optional[datetime] = None + + for post in posts: + text = post.text.strip() + text = text[text.rfind("|") + 1:] + date = demangle_date(text, fail_silently=True) + if not date: + continue + + if not newest_date or newest_date < date: + newest_date = date + + return newest_date diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index bbed986..156cd4c 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -18,7 +18,8 @@ from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadTo from ..http_crawler import HttpCrawler, HttpCrawlerSection from .file_templates import Links from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import IliasElementType, IliasPage, IliasPageElement +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, + _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -67,6 +68,9 @@ class 
KitIliasWebCrawlerSection(HttpCrawlerSection): def videos(self) -> bool: return self.s.getboolean("videos", fallback=False) + def forums(self) -> bool: + return self.s.getboolean("forums", fallback=False) + _DIRECTORY_PAGES: Set[IliasElementType] = set([ IliasElementType.EXERCISE, @@ -183,6 +187,7 @@ instance's greatest bottleneck. self._link_file_redirect_delay = section.link_redirect_delay() self._links = section.links() self._videos = section.videos() + self._forums = section.forums() self._visited_urls: Set[str] = set() async def _run(self) -> None: @@ -335,22 +340,27 @@ instance's greatest bottleneck. element_path = PurePath(parent_path, element.name) if element.type in _VIDEO_ELEMENTS: - log.explain_topic(f"Decision: Crawl video element {fmt_path(element_path)}") if not self._videos: - log.explain("Video crawling is disabled") - log.explain("Answer: no") + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](enable with option 'videos')" + ) return None - else: - log.explain("Video crawling is enabled") - log.explain("Answer: yes") if element.type == IliasElementType.FILE: return await self._handle_file(element, element_path) elif element.type == IliasElementType.FORUM: - log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") - log.explain("Forums are not supported") - log.explain("Answer: No") - return None + if not self._forums: + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](enable with option 'forums')" + ) + return None + return await self._handle_forum(element, element_path) elif element.type == IliasElementType.TEST: log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Tests contain no relevant files") @@ -635,6 +645,68 @@ instance's greatest bottleneck. 
if not await try_stream(): raise CrawlError("File streaming failed after authenticate()") + async def _handle_forum( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Coroutine[Any, Any, None]]: + maybe_cl = await self.crawl(element_path) + if not maybe_cl: + return None + return self._crawl_forum(element, maybe_cl) + + @_iorepeat(3, "crawling forum") + @anoncritical + async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: + elements = [] + + async with cl: + next_stage_url = element.url + while next_stage_url: + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {next_stage_url}") + + soup = await self._get_page(next_stage_url) + page = IliasPage(soup, next_stage_url, None) + + if next := page.get_next_stage_element(): + next_stage_url = next.url + else: + break + + download_data = page.get_download_forum_data() + if not download_data: + raise CrawlWarning("Failed to extract forum data") + html = await self._post_authenticated(download_data.url, download_data.form_data) + elements = parse_ilias_forum_export(soupify(html)) + + elements.sort(key=lambda elem: elem.title) + + tasks: List[Awaitable[None]] = [] + for elem in elements: + tasks.append(asyncio.create_task(self._download_forum_thread(cl.path, elem))) + + # And execute them + await self.gather(tasks) + + @anoncritical + @_iorepeat(3, "saving forum thread") + async def _download_forum_thread( + self, + parent_path: PurePath, + element: IliasForumThread, + ) -> None: + path = parent_path / (_sanitize_path_name(element.title) + ".html") + maybe_dl = await self.download(path, mtime=element.mtime) + if not maybe_dl: + return + + async with maybe_dl as (bar, sink): + content = element.title_tag.prettify() + content += element.content_tag.prettify() + sink.file.write(content.encode("utf-8")) + sink.done() + async def _get_page(self, url: str) -> BeautifulSoup: auth_id = await self._current_auth_id() async with 
self.session.get(url) as request: @@ -652,13 +724,37 @@ instance's greatest bottleneck. return soup raise CrawlError("get_page failed even after authenticating") + async def _post_authenticated( + self, + url: str, + data: dict[str, Union[str, List[str]]] + ) -> BeautifulSoup: + auth_id = await self._current_auth_id() + + form_data = aiohttp.FormData() + for key, val in data.items(): + form_data.add_field(key, val) + + async with self.session.post(url, data=form_data(), allow_redirects=False) as request: + if request.status == 200: + return await request.read() + + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) + + # Retry once after authenticating. If this fails, we will die. + async with self.session.post(url, data=data, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + raise CrawlError("post_authenticated failed even after authenticating") + # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. - @_iorepeat(3, "Login", failure_is_error=True) + @ _iorepeat(3, "Login", failure_is_error=True) async def _authenticate(self) -> None: await self._shibboleth_login.login(self.session) - @staticmethod + @ staticmethod def _is_logged_in(soup: BeautifulSoup) -> bool: # Normal ILIAS pages mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") diff --git a/PFERD/logging.py b/PFERD/logging.py index e833716..340b21f 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -197,7 +197,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_explain: self.print(f" {escape(text)}") - def status(self, style: str, action: str, text: str) -> None: + def status(self, style: str, action: str, text: str, suffix: str = "") -> None: """ Print a status update while crawling. Allows markup in the "style" argument which will be applied to the "action" string. 
@@ -205,7 +205,7 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_status: action = escape(f"{action:<{self.STATUS_WIDTH}}") - self.print(f"{style}{action}[/] {escape(text)}") + self.print(f"{style}{action}[/] {escape(text)} {suffix}") def report(self, text: str) -> None: """ From ed24366aba7cfb8ca3cdd0df7b2650bc1220437f Mon Sep 17 00:00:00 2001 From: Joscha Date: Sat, 15 Jan 2022 16:23:37 +0100 Subject: [PATCH 012/147] Add pass authenticator --- CHANGELOG.md | 1 + CONFIG.md | 21 ++++++++- PFERD/auth/__init__.py | 3 ++ PFERD/auth/pass_.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 PFERD/auth/pass_.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d70c4a..bc9f3e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Added - Download of page descriptions - Forum download support +- `pass` authenticator ### Changed - Add `cpp` extension to default `link_regex` of IPD crawler diff --git a/CONFIG.md b/CONFIG.md index f572a80..0f114ed 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -223,6 +223,23 @@ is stored in the keyring. - `keyring_name`: The service name PFERD uses for storing credentials. (Default: `PFERD`) +### The `pass` authenticator + +This authenticator queries the [`pass` password manager][3] for a username and +password. It tries to be mostly compatible with [browserpass][4] and +[passff][5], so see those links for an overview of the format. If PFERD fails +to load your password, you can use the `--explain` flag to see why. 
+ +- `passname`: The name of the password to use (Required) +- `username_prefixes`: A comma-separated list of username line prefixes + (Default: `login,username,user`) +- `password_prefixes`: A comma-separated list of password line prefixes + (Default: `password,pass,secret`) + +[3]: "Pass: The Standard Unix Password Manager" +[4]: "Organizing password store" +[5]: "Multi-line format" + ### The `tfa` authenticator This authenticator prompts the user on the console for a two-factor @@ -316,7 +333,7 @@ is a regular expression and `TARGET` an f-string based template. If a path matches `SOURCE`, the output path is created using `TARGET` as template. `SOURCE` is automatically anchored. -`TARGET` uses Python's [format string syntax][3]. The *n*-th capturing group can +`TARGET` uses Python's [format string syntax][6]. The *n*-th capturing group can be referred to as `{g}` (e.g. `{g3}`). `{g0}` refers to the original path. If capturing group *n*'s contents are a valid integer, the integer value is available as `{i}` (e.g. `{i3}`). 
If capturing group *n*'s contents are a @@ -337,7 +354,7 @@ Example: `f(oo+)/be?ar -re-> B{g1.upper()}H/fear` - Converts `fooooo/bear` into `BOOOOOH/fear` - Converts `foo/bar/baz` into `BOOH/fear/baz` -[3]: "Format String Syntax" +[6]: "Format String Syntax" ### The `-name-re->` arrow diff --git a/PFERD/auth/__init__.py b/PFERD/auth/__init__.py index 277cade..aa3ba8e 100644 --- a/PFERD/auth/__init__.py +++ b/PFERD/auth/__init__.py @@ -5,6 +5,7 @@ from ..config import Config from .authenticator import Authenticator, AuthError, AuthLoadError, AuthSection # noqa: F401 from .credential_file import CredentialFileAuthenticator, CredentialFileAuthSection from .keyring import KeyringAuthenticator, KeyringAuthSection +from .pass_ import PassAuthenticator, PassAuthSection from .simple import SimpleAuthenticator, SimpleAuthSection from .tfa import TfaAuthenticator @@ -19,6 +20,8 @@ AUTHENTICATORS: Dict[str, AuthConstructor] = { CredentialFileAuthenticator(n, CredentialFileAuthSection(s), c), "keyring": lambda n, s, c: KeyringAuthenticator(n, KeyringAuthSection(s)), + "pass": lambda n, s, c: + PassAuthenticator(n, PassAuthSection(s)), "simple": lambda n, s, c: SimpleAuthenticator(n, SimpleAuthSection(s)), "tfa": lambda n, s, c: diff --git a/PFERD/auth/pass_.py b/PFERD/auth/pass_.py new file mode 100644 index 0000000..4c8e775 --- /dev/null +++ b/PFERD/auth/pass_.py @@ -0,0 +1,98 @@ +import re +import subprocess +from typing import List, Tuple + +from ..logging import log +from .authenticator import Authenticator, AuthError, AuthSection + + +class PassAuthSection(AuthSection): + def passname(self) -> str: + if (value := self.s.get("passname")) is None: + self.missing_value("passname") + return value + + def username_prefixes(self) -> List[str]: + value = self.s.get("username_prefixes", "login,username,user") + return [prefix.lower() for prefix in value.split(",")] + + def password_prefixes(self) -> List[str]: + value = self.s.get("password_prefixes", "password,pass,secret") + 
return [prefix.lower() for prefix in value.split(",")] + + +class PassAuthenticator(Authenticator): + PREFIXED_LINE_RE = r"([a-zA-Z]+):\s?(.*)" # to be used with fullmatch + + def __init__(self, name: str, section: PassAuthSection) -> None: + super().__init__(name) + + self._passname = section.passname() + self._username_prefixes = section.username_prefixes() + self._password_prefixes = section.password_prefixes() + + async def credentials(self) -> Tuple[str, str]: + log.explain_topic("Obtaining credentials from pass") + + try: + log.explain(f"Calling 'pass show {self._passname}'") + result = subprocess.check_output(["pass", "show", self._passname], text=True) + except subprocess.CalledProcessError as e: + raise AuthError(f"Failed to get password info from {self._passname}: {e}") + + prefixed = {} + unprefixed = [] + for line in result.strip().splitlines(): + if match := re.fullmatch(self.PREFIXED_LINE_RE, line): + prefix = match.group(1).lower() + value = match.group(2) + log.explain(f"Found prefixed line {line!r} with prefix {prefix!r}, value {value!r}") + if prefix in prefixed: + raise AuthError(f"Prefix {prefix} specified multiple times") + prefixed[prefix] = value + else: + log.explain(f"Found unprefixed line {line!r}") + unprefixed.append(line) + + username = None + for prefix in self._username_prefixes: + log.explain(f"Looking for username at prefix {prefix!r}") + if prefix in prefixed: + username = prefixed[prefix] + log.explain(f"Found username {username!r}") + break + + password = None + for prefix in self._password_prefixes: + log.explain(f"Looking for password at prefix {prefix!r}") + if prefix in prefixed: + password = prefixed[prefix] + log.explain(f"Found password {password!r}") + break + + if password is None and username is None: + log.explain("No username and password found so far") + log.explain("Using first unprefixed line as password") + log.explain("Using second unprefixed line as username") + elif password is None: + log.explain("No password 
found so far") + log.explain("Using first unprefixed line as password") + elif username is None: + log.explain("No username found so far") + log.explain("Using first unprefixed line as username") + + if password is None: + if not unprefixed: + log.explain("Not enough unprefixed lines left") + raise AuthError("Password could not be determined") + password = unprefixed.pop(0) + log.explain(f"Found password {password!r}") + + if username is None: + if not unprefixed: + log.explain("Not enough unprefixed lines left") + raise AuthError("Username could not be determined") + username = unprefixed.pop(0) + log.explain(f"Found username {username!r}") + + return username, password From 345f52a1f6f55eecf6c31d3cc1a4350c5200087d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:41:29 +0200 Subject: [PATCH 013/147] Detect new login button --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 156cd4c..c99a920 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -759,7 +759,7 @@ instance's greatest bottleneck. 
# Normal ILIAS pages mainbar: Optional[Tag] = soup.find(class_="il-maincontrols-metabar") if mainbar is not None: - login_button = mainbar.find("button", attrs={"data-action": lambda x: x and "login.php" in x}) + login_button = mainbar.find(attrs={"href": lambda x: x and "login.php" in x}) shib_login = soup.find(id="button_shib_login") return not login_button and not shib_login From d9b111cec252f4b1810f06b0f2ca551cb5cdb2a2 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:45:33 +0200 Subject: [PATCH 014/147] Correctly nest description entries --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c99a920..1852c5f 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -308,7 +308,7 @@ instance's greatest bottleneck. await gather_elements() if description: - await self._download_description(PurePath("."), description[0]) + await self._download_description(cl.path, description[0]) elements.sort(key=lambda e: e.id()) From aa5a3a10bcbfa0dd54a0dc1a533625f76b2d6ed8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 14 Aug 2022 21:48:59 +0200 Subject: [PATCH 015/147] Adjust changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc9f3e5..7f35c9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,9 @@ ambiguous situations. 
- IPD crawler crashes on some sites - Meeting name normalization for yesterday, today and tomorrow - Crawling of meeting file previews +- Login with new login button html layout +- Descriptions for courses are now placed in the correct subfolder when + downloading the whole desktop ## 3.4.0 - 2022-05-01 From 66a5b1ba0223848f713192b084f2dcd26a18dbe5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 17 Aug 2022 13:24:01 +0200 Subject: [PATCH 016/147] Bump version to 3.4.1 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f35c9c..671d48a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.4.1 - 2022-08-17 + ### Added - Download of page descriptions - Forum download support diff --git a/PFERD/version.py b/PFERD/version.py index 8102d37..8832a51 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.0" +VERSION = "3.4.1" From 4a51aaa4f5a1b3382f0bed59f1292fc0952c2832 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 19 Oct 2022 22:59:33 +0200 Subject: [PATCH 017/147] Fix forum crawling crashing for empty threads --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 671d48a..70d2cd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Fixed +- Forum crawling crashing when parsing empty (= 0 messages) threads + ## 3.4.1 - 2022-08-17 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 7bab152..8795512 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -937,6 +937,13 @@ def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThre for p in forum_export.select("body > p"): title_tag = p content_tag = p.find_next_sibling("ul") + + if not content_tag: + # ILIAS allows users to delete the initial post while keeping the thread open + # This produces empty threads without *any* content. + # I am not sure why you would want this, but ILIAS makes it easy to do. + continue + title = p.find("b").text if ":" in title: title = title[title.find(":") + 1:] From d72fc2760b1dd8243ccf21876bb8cc6e027944bb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 13:09:29 +0200 Subject: [PATCH 018/147] Handle empty forums --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 7 +++++-- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70d2cd5..c7a9899 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. 
### Fixed - Forum crawling crashing when parsing empty (= 0 messages) threads +- Forum crawling crashing when a forum has no threads at all ## 3.4.1 - 2022-08-17 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 8795512..9ea6b9f 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -59,6 +59,7 @@ class IliasPageElement: class IliasDownloadForumData: url: str form_data: Dict[str, Union[str, List[str]]] + empty: bool @dataclass @@ -130,14 +131,16 @@ class IliasPage: return None post_url = self._abs_url_from_relative(form["action"]) + thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] + form_data: Dict[str, Union[str, List[ſtr]]] = { - "thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})], + "thread_ids[]": thread_ids, "selected_cmd2": "html", "select_cmd2": "Ausführen", "selected_cmd": "", } - return IliasDownloadForumData(post_url, form_data) + return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0) def get_next_stage_element(self) -> Optional[IliasPageElement]: if self._is_forum_page(): diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 1852c5f..f2d5215 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -658,7 +658,7 @@ instance's greatest bottleneck. @_iorepeat(3, "crawling forum") @anoncritical async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: - elements = [] + elements: List[IliasForumThread] = [] async with cl: next_stage_url = element.url @@ -677,6 +677,10 @@ instance's greatest bottleneck. 
download_data = page.get_download_forum_data() if not download_data: raise CrawlWarning("Failed to extract forum data") + if download_data.empty: + log.explain("Forum had no threads") + elements = [] + return html = await self._post_authenticated(download_data.url, download_data.form_data) elements = parse_ilias_forum_export(soupify(html)) From fb4631ba180a9ff0303d59e798d4bccfa0253666 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 13:13:36 +0200 Subject: [PATCH 019/147] Fix ilias background login --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 35 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index f2d5215..10a270f 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -23,6 +23,12 @@ from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, Ilia TargetType = Union[str, int] +_ILIAS_URL = "https://ilias.studium.kit.edu" + + +class KitShibbolethBackgroundLoginSuccessful(): + pass + class KitIliasWebCrawlerSection(HttpCrawlerSection): def target(self) -> TargetType: @@ -36,7 +42,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection): if target == "desktop": # Full personal desktop return target - if target.startswith("https://ilias.studium.kit.edu"): + if target.startswith(_ILIAS_URL): # ILIAS URL return target @@ -181,7 +187,7 @@ instance's greatest bottleneck. 
section.tfa_auth(authenticators), ) - self._base_url = "https://ilias.studium.kit.edu" + self._base_url = _ILIAS_URL self._target = section.target() self._link_file_redirect_delay = section.link_redirect_delay() @@ -808,14 +814,17 @@ class KitShibbolethLogin: # Equivalent: Click on "Mit KIT-Account anmelden" button in # https://ilias.studium.kit.edu/login.php - url = "https://ilias.studium.kit.edu/shib_login.php" + url = f"{_ILIAS_URL}/shib_login.php" data = { "sendLogin": "1", "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", "il_target": "", "home_organization_selection": "Weiter", } - soup: BeautifulSoup = await _shib_post(sess, url, data) + soup: Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful] = await _shib_post(sess, url, data) + + if isinstance(soup, KitShibbolethBackgroundLoginSuccessful): + return # Attempt to login using credentials, if necessary while not self._login_successful(soup): @@ -854,7 +863,7 @@ class KitShibbolethLogin: # (or clicking "Continue" if you have JS disabled) relay_state = soup.find("input", {"name": "RelayState"}) saml_response = soup.find("input", {"name": "SAMLResponse"}) - url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" + url = f"{_ILIAS_URL}/Shibboleth.sso/SAML2/POST" data = { # using the info obtained in the while loop above "RelayState": relay_state["value"], "SAMLResponse": saml_response["value"], @@ -903,22 +912,35 @@ async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> Beautifu return soupify(await response.read()) -async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: +async def _shib_post( + session: aiohttp.ClientSession, + url: str, + data: Any +) -> Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful]: """ aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected by Shibboleth. Thanks a lot. 
So now we unroll the requests manually, parse location headers and build encoded URL objects ourselves... Who thought mangling location header was a good idea?? """ + log.explain_topic("Shib login POST") async with session.post(url, data=data, allow_redirects=False) as response: location = response.headers.get("location") + log.explain(f"Got location {location!r}") if not location: raise CrawlWarning(f"Login failed (1), no location header present at {url}") correct_url = yarl.URL(location, encoded=True) + log.explain(f"Corrected location to {correct_url!r}") + + if str(correct_url).startswith(_ILIAS_URL): + log.explain("ILIAS recognized our shib token and logged us in in the background, returning") + return KitShibbolethBackgroundLoginSuccessful() async with session.get(correct_url, allow_redirects=False) as response: location = response.headers.get("location") + log.explain(f"Redirected to {location!r} with status {response.status}") # If shib still still has a valid session, it will directly respond to the request if location is None: + log.explain("Shib recognized us, returning its response directly") return soupify(await response.read()) as_yarl = yarl.URL(response.url) @@ -932,6 +954,7 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea path=location, encoded=True ) + log.explain(f"Corrected location to {correct_url!r}") async with session.get(correct_url, allow_redirects=False) as response: return soupify(await response.read()) From 5fdd40204b156b15c008ec1dee05e168672fe243 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 14:33:58 +0200 Subject: [PATCH 020/147] Unwrap future meetings when ILIAS hides them behind a pagination --- PFERD/crawl/ilias/kit_ilias_html.py | 20 +++++++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 29 ++++++++++++++-------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 
9ea6b9f..2f0011e 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -146,11 +146,17 @@ class IliasPage: if self._is_forum_page(): if "trows=800" in self._page_url: return None + log.explain("Requesting *all* forum threads") return self._get_show_max_forum_entries_per_page_url() if self._is_ilias_opencast_embedding(): + log.explain("Unwrapping opencast embedding") return self.get_child_elements()[0] if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + log.explain("Unwrapping video pagination") return self._find_video_entries_paginated()[0] + if self._contains_collapsed_future_meetings(): + log.explain("Requesting *all* future meetings") + return self._uncollapse_future_meetings_url() return None def _is_forum_page(self) -> bool: @@ -203,6 +209,16 @@ class IliasPage: return False return "target=copa_" in link.get("value") + def _contains_collapsed_future_meetings(self) -> bool: + return self._uncollapse_future_meetings_url() is not None + + def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]: + element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x}) + if not element: + return None + link = self._abs_url_from_link(element) + return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. 
The actual video stream url is nowhere @@ -793,6 +809,10 @@ class IliasPage: if img_tag is None: img_tag = found_parent.select_one("img.icon") + if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}): + log.explain("Found session expansion button, skipping it as it has no content") + return None + if img_tag is None: _unexpected_html_warning() log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 10a270f..bc0d816 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -234,19 +234,28 @@ instance's greatest bottleneck. async def gather_elements() -> None: elements.clear() async with cl: - soup = await self._get_page(url) - - if expected_id is not None: - perma_link_element: Tag = soup.find(id="current_perma_link") - if not perma_link_element or "crs_" not in perma_link_element.get("value"): - raise CrawlError("Invalid course id? Didn't find anything looking like a course") + next_stage_url: Optional[str] = url + current_parent = None # Duplicated code, but the root page is special - we want to avoid fetching it twice! - log.explain_topic("Parsing root HTML page") - log.explain(f"URL: {url}") - page = IliasPage(soup, url, None) - elements.extend(page.get_child_elements()) + while next_stage_url: + soup = await self._get_page(next_stage_url) + if current_parent is None and expected_id is not None: + perma_link_element: Tag = soup.find(id="current_perma_link") + if not perma_link_element or "crs_" not in perma_link_element.get("value"): + raise CrawlError("Invalid course id? 
Didn't find anything looking like a course") + + log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {next_stage_url}") + page = IliasPage(soup, next_stage_url, current_parent) + if next_element := page.get_next_stage_element(): + current_parent = next_element + next_stage_url = next_element.url + else: + next_stage_url = None + + elements.extend(page.get_child_elements()) if description_string := page.get_description(): description.append(description_string) From e1430e629844ad122a78d18197ed54100c734bbb Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 18:36:34 +0200 Subject: [PATCH 021/147] Handle (and ignore) surveys --- PFERD/crawl/ilias/kit_ilias_html.py | 3 +++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 2f0011e..d969577 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -24,6 +24,7 @@ class IliasElementType(Enum): LINK = "link" BOOKING = "booking" MEETING = "meeting" + SURVEY = "survey" VIDEO = "video" VIDEO_PLAYER = "video_player" VIDEO_FOLDER = "video_folder" @@ -730,6 +731,8 @@ class IliasPage: return IliasElementType.TEST if "fold" in icon["class"]: return IliasElementType.FOLDER + if "svy" in icon["class"]: + return IliasElementType.SURVEY _unexpected_html_warning() log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index bc0d816..5ff8212 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -380,6 +380,13 @@ instance's greatest bottleneck. 
log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.explain("Tests contain no relevant files") log.explain("Answer: No") + elif element.type == IliasElementType.SURVEY: + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](surveys contain no relevant data)" + ) return None elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) From 1b6be6bd79112faea6e56c43f4756dde10ba00ba Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 24 Oct 2022 18:36:54 +0200 Subject: [PATCH 022/147] Handle content pages in cards --- PFERD/crawl/ilias/kit_ilias_html.py | 2 ++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index d969577..ee0364a 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -731,6 +731,8 @@ class IliasPage: return IliasElementType.TEST if "fold" in icon["class"]: return IliasElementType.FOLDER + if "copa" in icon["class"]: + return IliasElementType.FOLDER if "svy" in icon["class"]: return IliasElementType.SURVEY diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 5ff8212..9295e93 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -377,9 +377,13 @@ instance's greatest bottleneck. 
return None return await self._handle_forum(element, element_path) elif element.type == IliasElementType.TEST: - log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") - log.explain("Tests contain no relevant files") - log.explain("Answer: No") + log.status( + "[bold bright_black]", + "Ignored", + fmt_path(element_path), + "[bright_black](tests contain no relevant data)" + ) + return None elif element.type == IliasElementType.SURVEY: log.status( "[bold bright_black]", From f47d2f11d843bfd3307815b231dd3e3df0265cef Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 25 Oct 2022 20:28:06 +0200 Subject: [PATCH 023/147] Append trailing slash to kit-ipd links to ensure urljoin works as expected --- CHANGELOG.md | 1 + PFERD/crawl/kit_ipd_crawler.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7a9899..24d9fa6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Fixed - Forum crawling crashing when parsing empty (= 0 messages) threads - Forum crawling crashing when a forum has no threads at all +- kit-ipd crawler if URL did not end with a trailing slash ## 3.4.1 - 2022-08-17 diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index d9fac32..338e059 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -24,6 +24,9 @@ class KitIpdCrawlerSection(HttpCrawlerSection): if not target.startswith("https://"): self.invalid_value("target", target, "Should be a URL") + if not target.endswith("/"): + target = target + "/" + return target def link_regex(self) -> Pattern[str]: From 37b51a66d87d368afc3bef2b81edf1629f95cd57 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 Oct 2022 18:22:37 +0200 Subject: [PATCH 024/147] Update changelog --- CHANGELOG.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24d9fa6..2bb0231 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,10 +22,16 @@ 
ambiguous situations. ## Unreleased +### Added +- Recognize and crawl content pages in cards +- Recognize and ignore surveys + ### Fixed -- Forum crawling crashing when parsing empty (= 0 messages) threads +- Forum crawling crashing when a thread has no messages at all - Forum crawling crashing when a forum has no threads at all -- kit-ipd crawler if URL did not end with a trailing slash +- Ilias login failing in some cases +- Crawling of paginated future meetings +- IPD crawler handling of URLs without trailing slash ## 3.4.1 - 2022-08-17 From 259cfc20cccae68a2f34984796405a35a7f31707 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 Oct 2022 18:26:17 +0200 Subject: [PATCH 025/147] Bump version to 3.4.2 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bb0231..9ecddf7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. ## Unreleased +## 3.4.2 - 2022-10-26 + ### Added - Recognize and crawl content pages in cards - Recognize and ignore surveys diff --git a/PFERD/version.py b/PFERD/version.py index 8832a51..0ef5d89 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.1" +VERSION = "3.4.2" From c020cccc64f152882688b119416f0582ec94e074 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Oct 2022 14:08:29 +0200 Subject: [PATCH 026/147] Include found paths in "second path found" warning --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 2 +- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 8 +++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ecddf7..3dd25b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. 
## Unreleased +### Changed +- Clear up error message shown when multiple paths are found to an element + ## 3.4.2 - 2022-10-26 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index ee0364a..56dcf7b 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -134,7 +134,7 @@ class IliasPage: thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})] - form_data: Dict[str, Union[str, List[ſtr]]] = { + form_data: Dict[str, Union[str, List[str]]] = { "thread_ids[]": thread_ids, "selected_cmd2": "html", "select_cmd2": "Ausführen", diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index 9295e93..e3719b8 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -194,7 +194,7 @@ instance's greatest bottleneck. self._links = section.links() self._videos = section.videos() self._forums = section.forums() - self._visited_urls: Set[str] = set() + self._visited_urls: Dict[str, PurePath] = dict() async def _run(self) -> None: if isinstance(self._target, int): @@ -348,9 +348,11 @@ instance's greatest bottleneck. ) -> Optional[Coroutine[Any, Any, None]]: if element.url in self._visited_urls: raise CrawlWarning( - f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" + f"Found second path to element {element.name!r} at {element.url!r}. " + + f"First path: {fmt_path(self._visited_urls[element.url])}. " + + f"Second path: {fmt_path(parent_path)}." 
) - self._visited_urls.add(element.url) + self._visited_urls[element.url] = parent_path element_path = PurePath(parent_path, element.name) From 07200bbde5fb72f2f846101b92b440724c8c7959 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 31 Oct 2022 14:10:45 +0100 Subject: [PATCH 027/147] Document ilias web crawler's forums option --- CHANGELOG.md | 3 +++ CONFIG.md | 1 + 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dd25b8..e5e81d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- Missing documentation for `forums` option + ### Changed - Clear up error message shown when multiple paths are found to an element diff --git a/CONFIG.md b/CONFIG.md index 0f114ed..1ca43c4 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -181,6 +181,7 @@ script once per day should be fine. redirect to the actual URL. Set to a negative value to disable the automatic redirect. (Default: `-1`) - `videos`: Whether to download videos. (Default: `no`) +- `forums`: Whether to download forum threads. (Default: `no`) - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: `20.0`) From e69b55b3496d58bc19d76429ca0078ab10f23074 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Fri, 4 Nov 2022 12:18:26 +0100 Subject: [PATCH 028/147] Add more unofficial package managers (#66) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ce917b0..31a3475 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended. 
Unofficial packages are available for: - [AUR](https://aur.archlinux.org/packages/pferd) +- [brew](https://formulae.brew.sh/formula/pferd) +- [conda-forge](https://github.com/conda-forge/pferd-feedstock) - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) +- [PyPi](https://pypi.org/project/pferd) See also PFERD's [repology page](https://repology.org/project/pferd/versions). From 635caa765decd9a747d8b313252fd6b56cea0951 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 15 Nov 2022 17:17:55 +0100 Subject: [PATCH 029/147] Fix typo Thanks, burg113 --- CONFIG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONFIG.md b/CONFIG.md index 1ca43c4..640e4af 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -290,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`. Example: `foo/bar --> baz` - Doesn't match `foo`, `a/foo/bar` or `foo/baz` - Converts `foo/bar` into `baz` -- Converts `foo/bar/wargl` into `bar/wargl` +- Converts `foo/bar/wargl` into `baz/wargl` Example: `foo/bar --> !` - Doesn't match `foo`, `a/foo/bar` or `foo/baz` From c0d6d8b22975234b0c9141a22307c8036698566c Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 21 Nov 2022 17:53:30 +0100 Subject: [PATCH 030/147] Use url after redirect for relative links --- CHANGELOG.md | 3 +++ PFERD/crawl/kit_ipd_crawler.py | 27 ++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5e81d6..5bbefd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,9 @@ ambiguous situations. 
### Changed - Clear up error message shown when multiple paths are found to an element +### Fixed +- IPD crawler unnecessarily appending trailing slashes + ## 3.4.2 - 2022-10-26 ### Added diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 338e059..c852be0 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -2,7 +2,7 @@ import os import re from dataclasses import dataclass from pathlib import PurePath -from typing import Awaitable, List, Optional, Pattern, Set, Union +from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -24,9 +24,6 @@ class KitIpdCrawlerSection(HttpCrawlerSection): if not target.startswith("https://"): self.invalid_value("target", target, "Should be a URL") - if not target.endswith("/"): - target = target + "/" - return target def link_regex(self) -> Pattern[str]: @@ -102,32 +99,32 @@ class KitIpdCrawler(HttpCrawler): await self._stream_from_url(file.url, sink, bar) async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: - page = await self.get_page() + page, url = await self.get_page() elements: List[Tag] = self._find_file_links(page) items: Set[Union[KitIpdFile, KitIpdFolder]] = set() for element in elements: folder_label = self._find_folder_label(element) if folder_label: - folder = self._extract_folder(folder_label) + folder = self._extract_folder(folder_label, url) if folder not in items: items.add(folder) folder.explain() else: - file = self._extract_file(element) + file = self._extract_file(element, url) items.add(file) log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") log.explain("Attributing it to root folder") return items - def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: + def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder: files: List[KitIpdFile] = [] name = folder_tag.getText().strip() container: Tag = 
folder_tag.findNextSibling(name="table") for link in self._find_file_links(container): - files.append(self._extract_file(link)) + files.append(self._extract_file(link, url)) return KitIpdFolder(name, files) @@ -138,16 +135,16 @@ class KitIpdCrawler(HttpCrawler): return None return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) - def _extract_file(self, link: Tag) -> KitIpdFile: - url = self._abs_url_from_link(link) + def _extract_file(self, link: Tag, url: str) -> KitIpdFile: + url = self._abs_url_from_link(url, link) name = os.path.basename(url) return KitIpdFile(name, url) def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: return tag.findAll(name="a", attrs={"href": self._file_regex}) - def _abs_url_from_link(self, link_tag: Tag) -> str: - return urljoin(self._url, link_tag.get("href")) + def _abs_url_from_link(self, url: str, link_tag: Tag) -> str: + return urljoin(url, link_tag.get("href")) async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: async with self.session.get(url, allow_redirects=False) as resp: @@ -162,7 +159,7 @@ class KitIpdCrawler(HttpCrawler): sink.done() - async def get_page(self) -> BeautifulSoup: + async def get_page(self) -> Tuple[BeautifulSoup, str]: async with self.session.get(self._url) as request: # The web page for Algorithmen für Routenplanung contains some # weird comments that beautifulsoup doesn't parse correctly. This @@ -170,4 +167,4 @@ class KitIpdCrawler(HttpCrawler): # cause issues on other pages. 
content = (await request.read()).decode("utf-8") content = re.sub(r"<!--.*?-->", "", content) - return soupify(content.encode("utf-8")) + return soupify(content.encode("utf-8")), str(request.url) From 55a2de6b88bbd2ee0cb031271e7045f53caa1702 Mon Sep 17 00:00:00 2001 From: c0derMo Date: Fri, 25 Nov 2022 10:25:22 +0000 Subject: [PATCH 031/147] Fix crawling English opencast --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bbefd4..1dc5abc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ ambiguous situations. ### Fixed - IPD crawler unnecessarily appending trailing slashes +- Crawling opencast when ILIAS is set to English ## 3.4.2 - 2022-10-26 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 56dcf7b..c0ebdc9 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -366,7 +366,7 @@ class IliasPage: """ # Video start links are marked with an "Abspielen" link video_links: List[Tag] = self._soup.findAll( - name="a", text=re.compile(r"\s*Abspielen\s*") + name="a", text=re.compile(r"\s*(Abspielen|Play)\s*") ) results: List[IliasPageElement] = [] From 6d44aac2783c69031e7686263fc0a2285912376f Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 29 Nov 2022 18:22:19 +0100 Subject: [PATCH 032/147] Bump version to 3.4.3 --- CHANGELOG.md | 2 ++ PFERD/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dc5abc..8793d43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ambiguous situations. 
## Unreleased +## 3.4.3 - 2022-11-29 + ### Added - Missing documentation for `forums` option diff --git a/PFERD/version.py b/PFERD/version.py index 0ef5d89..7043d78 100644 --- a/PFERD/version.py +++ b/PFERD/version.py @@ -1,2 +1,2 @@ NAME = "PFERD" -VERSION = "3.4.2" +VERSION = "3.4.3" From 722d2eb393913e770aff17da6b5b3b6603d1ee67 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 25 Nov 2022 12:49:36 +0100 Subject: [PATCH 033/147] Fix crawling of courses with preselected timeline tab --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/kit_ilias_html.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8793d43..b1d18cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Fixed +- Crawling of courses with the timeline view as the default tab + ## 3.4.3 - 2022-11-29 ### Added diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index c0ebdc9..44e44d9 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -158,6 +158,8 @@ class IliasPage: if self._contains_collapsed_future_meetings(): log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() + if not self._is_content_tab_selected(): + return self._select_content_page_url() return None def _is_forum_page(self) -> bool: @@ -220,6 +222,27 @@ class IliasPage: link = self._abs_url_from_link(element) return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings") + def _is_content_tab_selected(self) -> bool: + return self._select_content_page_url() is None + + def _select_content_page_url(self) -> Optional[IliasPageElement]: + tab = self._soup.find( + id="tab_view_content", + attrs={"class": lambda x: x is not None and "active" not in x} + ) + # Already selected (or not found) + if not tab: + return None + link = tab.find("a") + if link: + link = self._abs_url_from_link(link) + return 
IliasPageElement(IliasElementType.FOLDER, link, "select content page") + + _unexpected_html_warning() + log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.") + log.warn_contd("PFERD might not find content on the course's main page.") + return None + def _player_to_video(self) -> List[IliasPageElement]: # Fetch the actual video page. This is a small wrapper page initializing a javscript # player. Sadly we can not execute that JS. The actual video stream url is nowhere From 467fc526e8411d4a5113dbb78747aa119981c476 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 21 Mar 2023 23:52:24 +0100 Subject: [PATCH 034/147] Fix crawling of file/video cards --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1d18cd..c27059b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Crawling of courses with the timeline view as the default tab +- Crawling of file and custom opencast cards ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 44e44d9..079cfd6 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -738,7 +738,7 @@ class IliasPage: icon: Tag = card_root.select_one(".il-card-repository-head .icon") - if "opencast" in icon["class"]: + if "opencast" in icon["class"] or "xoct" in icon["class"]: return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED if "exc" in icon["class"]: return IliasElementType.EXERCISE @@ -758,6 +758,8 @@ class IliasPage: return IliasElementType.FOLDER if "svy" in icon["class"]: return IliasElementType.SURVEY + if "file" in icon["class"]: + return IliasElementType.FILE _unexpected_html_warning() log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") From 6f30c6583d6512c92042c581e86027a4341ddc89 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Tue, 21 
Mar 2023 23:52:33 +0100 Subject: [PATCH 035/147] Fix crawling of cards without descriptions --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c27059b..7a5f654 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Fixed - Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards +- Crawling of button cards without descriptions ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 079cfd6..efe6757 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -708,7 +708,11 @@ class IliasPage: "div", attrs={"class": lambda x: x and "caption" in x}, ) - description = caption_parent.find_next_sibling("div").getText().strip() + caption_container = caption_parent.find_next_sibling("div") + if caption_container: + description = caption_container.getText().strip() + else: + description = None if not type: _unexpected_html_warning() From 0294ceb7d5ff074dcc2566872d6b5f64f99c598f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Wed, 22 Mar 2023 00:08:19 +0100 Subject: [PATCH 036/147] Update github action versions --- .github/workflows/build-and-release.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 090ac7e..83a36e4 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -17,9 +17,9 @@ jobs: python: ["3.9"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} @@ -45,7 +45,7 @@ jobs: run: mv dist/pferd* dist/pferd-${{ matrix.os }} - name: Upload binary - uses: actions/upload-artifact@v2 + uses: 
actions/upload-artifact@v3 with: name: Binaries path: dist/pferd-${{ matrix.os }} @@ -57,7 +57,7 @@ jobs: steps: - name: Download binaries - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v3 with: name: Binaries From 443f7fe83913bcb82a42d7b70d4d05df65f05278 Mon Sep 17 00:00:00 2001 From: "Mr. Pine" Date: Sat, 29 Jul 2023 17:54:42 +0200 Subject: [PATCH 037/147] Add `no-delete-prompt-overwrite` crawler conflict resolution option (#75) --- CHANGELOG.md | 3 +++ CONFIG.md | 2 ++ LICENSE | 3 ++- PFERD/output_dir.py | 11 ++++++----- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a5f654..22522e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ ambiguous situations. - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +### Added +- `no-delete-prompt-override` conflict resolution strategy + ## 3.4.3 - 2022-11-29 ### Added diff --git a/CONFIG.md b/CONFIG.md index 640e4af..84ee885 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -75,6 +75,8 @@ common to all crawlers: using `prompt` and always choosing "yes". - `no-delete`: Never delete local files, but overwrite local files if the remote file is different. + - `no-delete-prompt-overwrite`: Never delete local files, but prompt to overwrite local files if the + remote file is different. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). 
(Default: empty) - `tasks`: The maximum number of concurrent tasks (such as crawling or diff --git a/LICENSE b/LICENSE index fe2293f..d81e827 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,6 @@ Copyright 2019-2021 Garmelon, I-Al-Istannen, danstooamerican, pavelzw, - TheChristophe, Scriptim, thelukasprobst, Toorero + TheChristophe, Scriptim, thelukasprobst, Toorero, + Mr-Pine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index c92f4a6..38d1288 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -44,6 +44,7 @@ class OnConflict(Enum): LOCAL_FIRST = "local-first" REMOTE_FIRST = "remote-first" NO_DELETE = "no-delete" + NO_DELETE_PROMPT_OVERWRITE = "no-delete-prompt-overwrite" @staticmethod def from_string(string: str) -> "OnConflict": @@ -51,7 +52,7 @@ class OnConflict(Enum): return OnConflict(string) except ValueError: raise ValueError("must be one of 'prompt', 'local-first'," - " 'remote-first', 'no-delete'") + " 'remote-first', 'no-delete', 'no-delete-prompt-overwrite'") @dataclass @@ -264,7 +265,7 @@ class OutputDirectory: on_conflict: OnConflict, path: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Replace {fmt_path(path)} with remote file?" return await prompt_yes_no(prompt, default=False) @@ -283,7 +284,7 @@ class OutputDirectory: on_conflict: OnConflict, path: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Recursively delete {fmt_path(path)} and replace with remote file?" 
return await prompt_yes_no(prompt, default=False) @@ -303,7 +304,7 @@ class OutputDirectory: path: PurePath, parent: PurePath, ) -> bool: - if on_conflict == OnConflict.PROMPT: + if on_conflict in {OnConflict.PROMPT, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: async with log.exclusive_output(): prompt = f"Delete {fmt_path(parent)} so remote file {fmt_path(path)} can be downloaded?" return await prompt_yes_no(prompt, default=False) @@ -330,7 +331,7 @@ class OutputDirectory: return False elif on_conflict == OnConflict.REMOTE_FIRST: return True - elif on_conflict == OnConflict.NO_DELETE: + elif on_conflict in {OnConflict.NO_DELETE, OnConflict.NO_DELETE_PROMPT_OVERWRITE}: return False # This should never be reached From d204dac8ced63534ca2b4596e9a63c880b2077a3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Fri, 2 Jun 2023 18:19:39 +0200 Subject: [PATCH 038/147] Detect unexpected root page redirects and abort operation --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 10 ++++++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 20 ++++++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22522e2..ee55659 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ ambiguous situations. 
- Crawling of courses with the timeline view as the default tab - Crawling of file and custom opencast cards - Crawling of button cards without descriptions +- Abort crawling when encountering an unexpected ilias root page redirect ### Added - `no-delete-prompt-override` conflict resolution strategy diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index efe6757..aed2069 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -79,6 +79,16 @@ class IliasPage: self._page_type = source_element.type if source_element else None self._source_name = source_element.name if source_element else "" + @staticmethod + def is_root_page(soup: BeautifulSoup) -> bool: + permalink = soup.find(id="current_perma_link") + if permalink is None: + return False + value = permalink.attrs.get("value") + if value is None: + return False + return "goto.php?target=root_" in value + def get_child_elements(self) -> List[IliasPageElement]: """ Return all child page elements you can find here. diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index e3719b8..ae49edc 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -239,7 +239,7 @@ instance's greatest bottleneck. # Duplicated code, but the root page is special - we want to avoid fetching it twice! while next_stage_url: - soup = await self._get_page(next_stage_url) + soup = await self._get_page(next_stage_url, root_page_allowed=True) if current_parent is None and expected_id is not None: perma_link_element: Tag = soup.find(id="current_perma_link") @@ -739,12 +739,12 @@ instance's greatest bottleneck. 
sink.file.write(content.encode("utf-8")) sink.done() - async def _get_page(self, url: str) -> BeautifulSoup: + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) # We weren't authenticated, so try to do that await self.authenticate(auth_id) @@ -753,9 +753,21 @@ instance's greatest bottleneck. async with self.session.get(url) as request: soup = soupify(await request.read()) if self._is_logged_in(soup): - return soup + return self._verify_page(soup, url, root_page_allowed) raise CrawlError("get_page failed even after authenticating") + def _verify_page(self, soup: BeautifulSoup, url: str, root_page_allowed: bool) -> BeautifulSoup: + if IliasPage.is_root_page(soup) and not root_page_allowed: + raise CrawlError( + "Unexpectedly encountered ILIAS root page. " + "This usually happens because the ILIAS instance is broken. " + "If so, wait a day or two and try again. " + "It could also happen because a crawled element links to the ILIAS root page. " + "If so, use a transform with a ! as target to ignore the particular element. 
" + f"The redirect came from {url}" + ) + return soup + async def _post_authenticated( self, url: str, From 123a57beec37090310f76df3746e6ce107ceb299 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Jul 2023 18:14:57 +0200 Subject: [PATCH 039/147] Fix mypy unreachable error in file_templates --- PFERD/crawl/ilias/file_templates.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 151a41b..59123a2 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -102,24 +102,24 @@ class Links(Enum): INTERNET_SHORTCUT = "internet-shortcut" def template(self) -> Optional[str]: - if self == self.FANCY: + if self == Links.FANCY: return _link_template_fancy - elif self == self.PLAINTEXT: + elif self == Links.PLAINTEXT: return _link_template_plain - elif self == self.INTERNET_SHORTCUT: + elif self == Links.INTERNET_SHORTCUT: return _link_template_internet_shortcut - elif self == self.IGNORE: + elif self == Links.IGNORE: return None raise ValueError("Missing switch case") def extension(self) -> Optional[str]: - if self == self.FANCY: + if self == Links.FANCY: return ".html" - elif self == self.PLAINTEXT: + elif self == Links.PLAINTEXT: return ".txt" - elif self == self.INTERNET_SHORTCUT: + elif self == Links.INTERNET_SHORTCUT: return ".url" - elif self == self.IGNORE: + elif self == Links.IGNORE: return None raise ValueError("Missing switch case") From 68c398f1fea5cfefd86d11e79f2f6582d50e6563 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 29 Jul 2023 23:23:10 +0200 Subject: [PATCH 040/147] Add support for ILIAS learning modules --- CHANGELOG.md | 1 + PFERD/crawl/ilias/file_templates.py | 69 +++++++++ PFERD/crawl/ilias/ilias_html_cleaner.py | 2 +- PFERD/crawl/ilias/kit_ilias_html.py | 46 ++++++ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 160 ++++++++++++++++++++- 5 files changed, 272 insertions(+), 6 deletions(-) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index ee55659..6e3925c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ ambiguous situations. ### Added - `no-delete-prompt-override` conflict resolution strategy +- support for ILIAS learning modules ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/file_templates.py b/PFERD/crawl/ilias/file_templates.py index 59123a2..b206461 100644 --- a/PFERD/crawl/ilias/file_templates.py +++ b/PFERD/crawl/ilias/file_templates.py @@ -1,6 +1,10 @@ from enum import Enum from typing import Optional +import bs4 + +from PFERD.utils import soupify + _link_template_plain = "{{link}}" _link_template_fancy = """ @@ -94,6 +98,71 @@ _link_template_internet_shortcut = """ URL={{link}} """.strip() +_learning_module_template = """ + + + + + {{name}} + + + + +{{body}} + + +""" + + +def learning_module_template(body: bs4.Tag, name: str, prev: Optional[str], next: Optional[str]) -> str: + # Seems to be comments, ignore those. + for elem in body.select(".il-copg-mob-fullscreen-modal"): + elem.decompose() + + nav_template = """ + + """ + if prev and body.select_one(".ilc_page_lnav_LeftNavigation"): + text = body.select_one(".ilc_page_lnav_LeftNavigation").getText().strip() + left = f'{text}' + else: + left = "" + + if next and body.select_one(".ilc_page_rnav_RightNavigation"): + text = body.select_one(".ilc_page_rnav_RightNavigation").getText().strip() + right = f'{text}' + else: + right = "" + + if top_nav := body.select_one(".ilc_page_tnav_TopNavigation"): + top_nav.replace_with( + soupify(nav_template.replace("{{left}}", left).replace("{{right}}", right).encode()) + ) + + if bot_nav := body.select_one(".ilc_page_bnav_BottomNavigation"): + bot_nav.replace_with(soupify(nav_template.replace( + "{{left}}", left).replace("{{right}}", right).encode()) + ) + + body = body.prettify() + return _learning_module_template.replace("{{body}}", body).replace("{{name}}", name) + class Links(Enum): IGNORE = "ignore" diff --git 
a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index 5952309..5495304 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -82,7 +82,7 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: dummy.decompose() if len(children) > 1: continue - if type(children[0]) == Comment: + if isinstance(type(children[0]), Comment): dummy.decompose() for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index aed2069..46a8073 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -22,6 +22,7 @@ class IliasElementType(Enum): FOLDER = "folder" FORUM = "forum" LINK = "link" + LEARNING_MODULE = "learning_module" BOOKING = "booking" MEETING = "meeting" SURVEY = "survey" @@ -71,6 +72,14 @@ class IliasForumThread: mtime: Optional[datetime] +@dataclass +class IliasLearningModulePage: + title: str + content: Tag + next_url: Optional[str] + previous_url: Optional[str] + + class IliasPage: def __init__(self, soup: BeautifulSoup, _page_url: str, source_element: Optional[IliasPageElement]): @@ -136,6 +145,34 @@ class IliasPage: return BeautifulSoup(raw_html, "html.parser") + def get_learning_module_data(self) -> Optional[IliasLearningModulePage]: + if not self._is_learning_module_page(): + return None + content = self._soup.select_one("#ilLMPageContent") + title = self._soup.select_one(".ilc_page_title_PageTitle").getText().strip() + return IliasLearningModulePage( + title=title, + content=content, + next_url=self._find_learning_module_next(), + previous_url=self._find_learning_module_prev() + ) + + def _find_learning_module_next(self) -> Optional[str]: + for link in self._soup.select("a.ilc_page_rnavlink_RightNavigationLink"): + url = self._abs_url_from_link(link) + if "baseClass=ilLMPresentationGUI" not in url: + continue + return url + return None + + def 
_find_learning_module_prev(self) -> Optional[str]: + for link in self._soup.select("a.ilc_page_lnavlink_LeftNavigationLink"): + url = self._abs_url_from_link(link) + if "baseClass=ilLMPresentationGUI" not in url: + continue + return url + return None + def get_download_forum_data(self) -> Optional[IliasDownloadForumData]: form = self._soup.find("form", attrs={"action": lambda x: x and "fallbackCmd=showThreads" in x}) if not form: @@ -222,6 +259,12 @@ class IliasPage: return False return "target=copa_" in link.get("value") + def _is_learning_module_page(self) -> bool: + link = self._soup.find(id="current_perma_link") + if not link: + return False + return "target=pg_" in link.get("value") + def _contains_collapsed_future_meetings(self) -> bool: return self._uncollapse_future_meetings_url() is not None @@ -812,6 +855,9 @@ class IliasPage: if "cmdClass=ilobjtestgui" in parsed_url.query: return IliasElementType.TEST + if "baseClass=ilLMPresentationGUI" in parsed_url.query: + return IliasElementType.LEARNING_MODULE + # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so # try to guess it from the image. 
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index ae49edc..f82d684 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -1,8 +1,11 @@ import asyncio +import base64 +import os import re from collections.abc import Awaitable, Coroutine from pathlib import PurePath -from typing import Any, Callable, Dict, List, Optional, Set, Union, cast +from typing import Any, Callable, Dict, List, Literal, Optional, Set, Union, cast +from urllib.parse import urljoin import aiohttp import yarl @@ -16,10 +19,10 @@ from ...output_dir import FileSink, Redownload from ...utils import fmt_path, soupify, url_set_query_param from ..crawler import AWrapped, CrawlError, CrawlToken, CrawlWarning, DownloadToken, anoncritical from ..http_crawler import HttpCrawler, HttpCrawlerSection -from .file_templates import Links +from .file_templates import Links, learning_module_template from .ilias_html_cleaner import clean, insert_base_markup -from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, IliasPageElement, - _sanitize_path_name, parse_ilias_forum_export) +from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasLearningModulePage, IliasPage, + IliasPageElement, _sanitize_path_name, parse_ilias_forum_export) TargetType = Union[str, int] @@ -394,6 +397,8 @@ instance's greatest bottleneck. "[bright_black](surveys contain no relevant data)" ) return None + elif element.type == IliasElementType.LEARNING_MODULE: + return await self._handle_learning_module(element, element_path) elif element.type == IliasElementType.LINK: return await self._handle_link(element, element_path) elif element.type == IliasElementType.BOOKING: @@ -739,6 +744,135 @@ instance's greatest bottleneck. 
sink.file.write(content.encode("utf-8")) sink.done() + async def _handle_learning_module( + self, + element: IliasPageElement, + element_path: PurePath, + ) -> Optional[Coroutine[Any, Any, None]]: + maybe_cl = await self.crawl(element_path) + if not maybe_cl: + return None + return self._crawl_learning_module(element, maybe_cl) + + @_iorepeat(3, "crawling learning module") + @anoncritical + async def _crawl_learning_module(self, element: IliasPageElement, cl: CrawlToken) -> None: + elements: List[IliasLearningModulePage] = [] + + async with cl: + log.explain_topic(f"Parsing initial HTML page for {fmt_path(cl.path)}") + log.explain(f"URL: {element.url}") + soup = await self._get_page(element.url) + page = IliasPage(soup, element.url, None) + if next := page.get_learning_module_data(): + elements.extend(await self._crawl_learning_module_direction( + cl.path, next.previous_url, "left" + )) + elements.append(next) + elements.extend(await self._crawl_learning_module_direction( + cl.path, next.next_url, "right" + )) + + # Reflect their natural ordering in the file names + for index, lm_element in enumerate(elements): + lm_element.title = f"{index:02}_{lm_element.title}" + + tasks: List[Awaitable[None]] = [] + for index, elem in enumerate(elements): + prev_url = elements[index - 1].title if index > 0 else None + next_url = elements[index + 1].title if index < len(elements) - 1 else None + tasks.append(asyncio.create_task( + self._download_learning_module_page(cl.path, elem, prev_url, next_url) + )) + + # And execute them + await self.gather(tasks) + + async def _crawl_learning_module_direction( + self, + path: PurePath, + start_url: Optional[str], + dir: Union[Literal["left"], Literal["right"]] + ) -> List[IliasLearningModulePage]: + elements: List[IliasLearningModulePage] = [] + + if not start_url: + return elements + + next_element_url: Optional[str] = start_url + counter = 0 + while next_element_url: + log.explain_topic(f"Parsing HTML page for {fmt_path(path)} 
({dir}-{counter})") + log.explain(f"URL: {next_element_url}") + soup = await self._get_page(next_element_url) + page = IliasPage(soup, next_element_url, None) + if next := page.get_learning_module_data(): + elements.append(next) + if dir == "left": + next_element_url = next.previous_url + else: + next_element_url = next.next_url + counter += 1 + + return elements + + @anoncritical + @_iorepeat(3, "saving learning module page") + async def _download_learning_module_page( + self, + parent_path: PurePath, + element: IliasLearningModulePage, + prev: Optional[str], + next: Optional[str] + ) -> None: + path = parent_path / (_sanitize_path_name(element.title) + ".html") + maybe_dl = await self.download(path) + if not maybe_dl: + return + my_path = self._transformer.transform(maybe_dl.path) + if not my_path: + return + + if prev: + prev_p = self._transformer.transform(parent_path / (_sanitize_path_name(prev) + ".html")) + if prev_p: + prev = os.path.relpath(prev_p, my_path.parent) + else: + prev = None + if next: + next_p = self._transformer.transform(parent_path / (_sanitize_path_name(next) + ".html")) + if next_p: + next = os.path.relpath(next_p, my_path.parent) + else: + next = None + + async with maybe_dl as (bar, sink): + content = element.content + content = await self.internalize_images(content) + sink.file.write(learning_module_template(content, maybe_dl.path.name, prev, next).encode("utf-8")) + sink.done() + + async def internalize_images(self, tag: Tag) -> Tag: + """ + Tries to fetch ILIAS images and embed them as base64 data. 
+ """ + log.explain_topic("Internalizing images") + for elem in tag.find_all(recursive=True): + if not isinstance(elem, Tag): + continue + if elem.name == "img": + if src := elem.attrs.get("src", None): + url = urljoin(_ILIAS_URL, src) + if not url.startswith(_ILIAS_URL): + continue + log.explain(f"Internalizing {url!r}") + img = await self._get_authenticated(url) + elem.attrs["src"] = "data:;base64," + base64.b64encode(img).decode() + if elem.name == "iframe" and elem.attrs.get("src", "").startswith("//"): + # For unknown reasons the protocol seems to be stripped. + elem.attrs["src"] = "https:" + elem.attrs["src"] + return tag + async def _get_page(self, url: str, root_page_allowed: bool = False) -> BeautifulSoup: auth_id = await self._current_auth_id() async with self.session.get(url) as request: @@ -772,7 +906,7 @@ instance's greatest bottleneck. self, url: str, data: dict[str, Union[str, List[str]]] - ) -> BeautifulSoup: + ) -> bytes: auth_id = await self._current_auth_id() form_data = aiohttp.FormData() @@ -792,6 +926,22 @@ instance's greatest bottleneck. return await request.read() raise CrawlError("post_authenticated failed even after authenticating") + async def _get_authenticated(self, url: str) -> bytes: + auth_id = await self._current_auth_id() + + async with self.session.get(url, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + + # We weren't authenticated, so try to do that + await self.authenticate(auth_id) + + # Retry once after authenticating. If this fails, we will die. + async with self.session.get(url, allow_redirects=False) as request: + if request.status == 200: + return await request.read() + raise CrawlError("get_authenticated failed even after authenticating") + # We repeat this as the login method in shibboleth doesn't handle I/O errors. # Shibboleth is quite reliable as well, the repeat is likely not critical here. 
@ _iorepeat(3, "Login", failure_is_error=True) From dbc2553b119c39c7a8ad196c6858fc8109f746a9 Mon Sep 17 00:00:00 2001 From: "Mr. Pine" <50425705+Mr-Pine@users.noreply.github.com> Date: Wed, 15 Mar 2023 15:33:42 +0100 Subject: [PATCH 041/147] Add default `show-not-deleted` option If set to `no`, PFERD won't print status or report messages for not deleted files --- CHANGELOG.md | 3 +++ CONFIG.md | 8 ++++++-- PFERD/__main__.py | 4 ++++ PFERD/cli/parser.py | 7 +++++++ PFERD/config.py | 3 +++ PFERD/logging.py | 20 ++++++++++++++++++++ PFERD/output_dir.py | 2 +- PFERD/pferd.py | 2 +- 8 files changed, 45 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e3925c..85513d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,9 @@ ambiguous situations. ### Added - `no-delete-prompt-override` conflict resolution strategy - support for ILIAS learning modules +- `show_not_deleted` option to stop printing the "Not Deleted" status or report + message. This combines nicely with the `no-delete-prompt-override` strategy, + causing PFERD to mostly ignore local-only files. ## 3.4.3 - 2022-11-29 diff --git a/CONFIG.md b/CONFIG.md index 84ee885..5f62749 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -26,6 +26,9 @@ default values for the other sections. `Added ...`) while running a crawler. (Default: `yes`) - `report`: Whether PFERD should print a report of added, changed and deleted local files for all crawlers before exiting. (Default: `yes`) +- `show_not_deleted`: Whether PFERD should print messages in status and report + when a local-only file wasn't deleted. Combines nicely with the + `no-delete-prompt-override` conflict resolution strategy. - `share_cookies`: Whether crawlers should share cookies where applicable. For example, some crawlers share cookies if they crawl the same website using the same account. (Default: `yes`) @@ -75,8 +78,9 @@ common to all crawlers: using `prompt` and always choosing "yes". 
- `no-delete`: Never delete local files, but overwrite local files if the remote file is different. - - `no-delete-prompt-overwrite`: Never delete local files, but prompt to overwrite local files if the - remote file is different. + - `no-delete-prompt-overwrite`: Never delete local files, but prompt to + overwrite local files if the remote file is different. Combines nicely + with the `show_not_deleted` option. - `transform`: Rules for renaming and excluding certain files and directories. For more details, see [this section](#transformation-rules). (Default: empty) - `tasks`: The maximum number of concurrent tasks (such as crawling or diff --git a/PFERD/__main__.py b/PFERD/__main__.py index 4faeb13..cb8c67c 100644 --- a/PFERD/__main__.py +++ b/PFERD/__main__.py @@ -47,6 +47,8 @@ def configure_logging_from_args(args: argparse.Namespace) -> None: log.output_explain = args.explain if args.status is not None: log.output_status = args.status + if args.show_not_deleted is not None: + log.output_not_deleted = args.show_not_deleted if args.report is not None: log.output_report = args.report @@ -72,6 +74,8 @@ def configure_logging_from_config(args: argparse.Namespace, config: Config) -> N log.output_status = config.default_section.status() if args.report is None: log.output_report = config.default_section.report() + if args.show_not_deleted is None: + log.output_not_deleted = config.default_section.show_not_deleted() except ConfigOptionError as e: log.error(str(e)) sys.exit(1) diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py index e753023..be483fd 100644 --- a/PFERD/cli/parser.py +++ b/PFERD/cli/parser.py @@ -215,6 +215,11 @@ PARSER.add_argument( action=BooleanOptionalAction, help="whether crawlers should share cookies where applicable" ) +PARSER.add_argument( + "--show-not-deleted", + action=BooleanOptionalAction, + help="print messages in status and report when PFERD did not delete a local only file" +) def load_default_section( @@ -233,6 +238,8 @@ def 
load_default_section( section["report"] = "yes" if args.report else "no" if args.share_cookies is not None: section["share_cookies"] = "yes" if args.share_cookies else "no" + if args.show_not_deleted is not None: + section["show_not_deleted"] = "yes" if args.show_not_deleted else "no" SUBPARSERS = PARSER.add_subparsers(title="crawlers") diff --git a/PFERD/config.py b/PFERD/config.py index 8f7e682..b2cff4e 100644 --- a/PFERD/config.py +++ b/PFERD/config.py @@ -82,6 +82,9 @@ class DefaultSection(Section): def report(self) -> bool: return self.s.getboolean("report", fallback=True) + def show_not_deleted(self) -> bool: + return self.s.getboolean("show_not_deleted", fallback=True) + def share_cookies(self) -> bool: return self.s.getboolean("share_cookies", fallback=True) diff --git a/PFERD/logging.py b/PFERD/logging.py index 340b21f..b958fb2 100644 --- a/PFERD/logging.py +++ b/PFERD/logging.py @@ -59,6 +59,7 @@ class Log: # Whether different parts of the output are enabled or disabled self.output_explain = False self.output_status = True + self.output_not_deleted = True self.output_report = True def _update_live(self) -> None: @@ -207,6 +208,17 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new action = escape(f"{action:<{self.STATUS_WIDTH}}") self.print(f"{style}{action}[/] {escape(text)} {suffix}") + def not_deleted(self, style: str, action: str, text: str, suffix: str = "") -> None: + """ + Print a message for a local only file that wasn't + deleted while crawling. Allows markup in the "style" + argument which will be applied to the "action" string. + """ + + if self.output_status and self.output_not_deleted: + action = escape(f"{action:<{self.STATUS_WIDTH}}") + self.print(f"{style}{action}[/] {escape(text)} {suffix}") + def report(self, text: str) -> None: """ Print a report after crawling. Allows markup. 
@@ -215,6 +227,14 @@ directly or as a GitHub issue: https://github.com/Garmelon/PFERD/issues/new if self.output_report: self.print(text) + def report_not_deleted(self, text: str) -> None: + """ + Print a report for a local only file that wasn't deleted after crawling. Allows markup. + """ + + if self.output_report and self.output_not_deleted: + self.print(text) + @contextmanager def _bar( self, diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 38d1288..e9e9b93 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -496,7 +496,7 @@ class OutputDirectory: except OSError: pass else: - log.status("[bold bright_magenta]", "Not deleted", fmt_path(pure)) + log.not_deleted("[bold bright_magenta]", "Not deleted", fmt_path(pure)) self._report.not_delete_file(pure) def load_prev_report(self) -> None: diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 079053b..b30a04a 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -180,7 +180,7 @@ class Pferd: log.report(f" [bold bright_magenta]Deleted[/] {fmt_path(path)}") for path in sorted(crawler.report.not_deleted_files): something_changed = True - log.report(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") + log.report_not_deleted(f" [bold bright_magenta]Not deleted[/] {fmt_path(path)}") for warning in crawler.report.encountered_warnings: something_changed = True From b3d412360baeed6992535e6957d0bc1e368c337f Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 26 Aug 2023 23:48:14 +0200 Subject: [PATCH 042/147] Add Nix flake --- flake.lock | 27 +++++++++++++++++++++++++++ flake.nix | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..914c58b --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1692986144, + "narHash": "sha256-M4VFpy7Av9j+33HF5nIGm0k2+DXXW4qSSKdidIKg5jY=", + "owner": 
"NixOS", + "repo": "nixpkgs", + "rev": "74e5bdc5478ebbe7ba5849f0d765f92757bb9dbf", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-23.05", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..e3d52af --- /dev/null +++ b/flake.nix @@ -0,0 +1,41 @@ +{ + description = "Tool for downloading course-related files from ILIAS"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.05"; + }; + + outputs = { self, nixpkgs }: + let + # Helper function to generate an attrset '{ x86_64-linux = f "x86_64-linux"; ... }'. + forAllSystems = nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed; + in + { + packages = forAllSystems (system: + let pkgs = import nixpkgs { inherit system; }; + in + rec { + default = pkgs.python3Packages.buildPythonApplication rec { + pname = "pferd"; + # Performing black magic + # Don't worry, I sacrificed enough goats for the next few years + version = (pkgs.lib.importTOML ./PFERD/version.py).VERSION; + format = "pyproject"; + + src = ./.; + + nativeBuildInputs = with pkgs.python3Packages; [ + setuptools + ]; + + propagatedBuildInputs = with pkgs.python3Packages; [ + aiohttp + beautifulsoup4 + rich + keyring + certifi + ]; + }; + }); + }; +} From 2184ac804018e836e439e365ae2b0d184adae26d Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 26 Aug 2023 19:39:40 +0200 Subject: [PATCH 043/147] Add support for ILIAS mediacast listings --- CHANGELOG.md | 1 + PFERD/crawl/ilias/kit_ilias_html.py | 110 +++++++++++++++------ PFERD/crawl/ilias/kit_ilias_web_crawler.py | 45 +++++---- 3 files changed, 107 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85513d2..d58ea18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ ambiguous situations. 
- `show_not_deleted` option to stop printing the "Not Deleted" status or report message. This combines nicely with the `no-delete-prompt-override` strategy, causing PFERD to mostly ignore local-only files. +- support for mediacast video listings ## 3.4.3 - 2022-11-29 diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 46a8073..d5ea76d 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from datetime import date, datetime, timedelta from enum import Enum -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, cast from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup, Tag @@ -26,10 +26,12 @@ class IliasElementType(Enum): BOOKING = "booking" MEETING = "meeting" SURVEY = "survey" - VIDEO = "video" - VIDEO_PLAYER = "video_player" - VIDEO_FOLDER = "video_folder" - VIDEO_FOLDER_MAYBE_PAGINATED = "video_folder_maybe_paginated" + MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" + MEDIACAST_VIDEO = "mediacast_video" + OPENCAST_VIDEO = "opencast_video" + OPENCAST_VIDEO_PLAYER = "opencast_video_player" + OPENCAST_VIDEO_FOLDER = "opencast_video_folder" + OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated" @dataclass @@ -45,7 +47,8 @@ class IliasPageElement: r"eid=(?P[0-9a-z\-]+)", r"file_(?P\d+)", r"ref_id=(?P\d+)", - r"target=[a-z]+_(?P\d+)" + r"target=[a-z]+_(?P\d+)", + r"mm_(?P\d+)" ] for regex in regexes: @@ -105,9 +108,9 @@ class IliasPage: if self._is_video_player(): log.explain("Page is a video player, extracting URL") return self._player_to_video() - if self._is_video_listing(): - log.explain("Page is a video listing, searching for elements") - return self._find_video_entries() + if self._is_opencast_video_listing(): + log.explain("Page is an opencast video listing, searching for elements") + return self._find_opencast_video_entries() if 
self._is_exercise_file(): log.explain("Page is an exercise, searching for elements") return self._find_exercise_entries() @@ -199,9 +202,9 @@ class IliasPage: if self._is_ilias_opencast_embedding(): log.explain("Unwrapping opencast embedding") return self.get_child_elements()[0] - if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: + if self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED: log.explain("Unwrapping video pagination") - return self._find_video_entries_paginated()[0] + return self._find_opencast_video_entries_paginated()[0] if self._contains_collapsed_future_meetings(): log.explain("Requesting *all* future meetings") return self._uncollapse_future_meetings_url() @@ -219,7 +222,7 @@ class IliasPage: def _is_video_player(self) -> bool: return "paella_config_file" in str(self._soup) - def _is_video_listing(self) -> bool: + def _is_opencast_video_listing(self) -> bool: if self._is_ilias_opencast_embedding(): return True @@ -319,14 +322,14 @@ class IliasPage: # and just fetch the lone video url! if len(streams) == 1: video_url = streams[0]["sources"]["mp4"][0]["src"] - return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, self._source_name)] log.explain(f"Found multiple videos for stream at {self._source_name}") items = [] for stream in sorted(streams, key=lambda stream: stream["content"]): full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4" video_url = stream["sources"]["mp4"][0]["src"] - items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name)) + items.append(IliasPageElement(IliasElementType.OPENCAST_VIDEO, video_url, full_name)) return items @@ -385,7 +388,7 @@ class IliasPage: return items - def _find_video_entries(self) -> List[IliasPageElement]: + def _find_opencast_video_entries(self) -> List[IliasPageElement]: # ILIAS has three stages for video pages # 1. 
The initial dummy page without any videos. This page contains the link to the listing # 2. The video listing which might be paginated @@ -405,27 +408,27 @@ class IliasPage: query_params = {"limit": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} url = url_set_query_params(url, query_params) log.explain("Found ILIAS video frame page, fetching actual content next") - return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, url, "")] is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None - if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER: + if is_paginated and not self._page_type == IliasElementType.OPENCAST_VIDEO_FOLDER: # We are in stage 2 - try to break pagination - return self._find_video_entries_paginated() + return self._find_opencast_video_entries_paginated() - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() - def _find_video_entries_paginated(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_paginated(self) -> List[IliasPageElement]: table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+")) if table_element is None: log.warn("Couldn't increase elements per page (table not found). I might miss elements.") - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"]) if id_match is None: log.warn("Couldn't increase elements per page (table id not found). 
I might miss elements.") - return self._find_video_entries_no_paging() + return self._find_opencast_video_entries_no_paging() table_id = id_match.group(1) @@ -434,9 +437,9 @@ class IliasPage: url = url_set_query_params(self._page_url, query_params) log.explain("Disabled pagination, retrying folder as a new entry") - return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")] + return [IliasPageElement(IliasElementType.OPENCAST_VIDEO_FOLDER, url, "")] - def _find_video_entries_no_paging(self) -> List[IliasPageElement]: + def _find_opencast_video_entries_no_paging(self) -> List[IliasPageElement]: """ Crawls the "second stage" video page. This page contains the actual video urls. """ @@ -448,11 +451,11 @@ class IliasPage: results: List[IliasPageElement] = [] for link in video_links: - results.append(self._listed_video_to_element(link)) + results.append(self._listed_opencast_video_to_element(link)) return results - def _listed_video_to_element(self, link: Tag) -> IliasPageElement: + def _listed_opencast_video_to_element(self, link: Tag) -> IliasPageElement: # The link is part of a table with multiple columns, describing metadata. # 6th or 7th child (1 indexed) is the modification time string. 
Try to find it # by parsing backwards from the end and finding something that looks like a date @@ -479,7 +482,9 @@ class IliasPage: video_url = self._abs_url_from_link(link) log.explain(f"Found video {video_name!r} at {video_url}") - return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time) + return IliasPageElement( + IliasElementType.OPENCAST_VIDEO_PLAYER, video_url, video_name, modification_time + ) def _find_exercise_entries(self) -> List[IliasPageElement]: if self._soup.find(id="tab_submission"): @@ -622,9 +627,48 @@ class IliasPage: result.append(IliasPageElement(element_type, abs_url, element_name, description=description)) result += self._find_cards() + result += self._find_mediacast_videos() return result + def _find_mediacast_videos(self) -> List[IliasPageElement]: + videos: List[IliasPageElement] = [] + + for elem in cast(List[Tag], self._soup.select(".ilPlayerPreviewOverlayOuter")): + element_name = _sanitize_path_name( + elem.select_one(".ilPlayerPreviewDescription").getText().strip() + ) + if not element_name.endswith(".mp4"): + # just to make sure it has some kinda-alrightish ending + element_name = element_name + ".mp4" + video_element = elem.find(name="video") + if not video_element: + _unexpected_html_warning() + log.warn_contd(f"No