From c54c3bcfa157631af1d55a210b60ad3bfc64f972 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 27 Oct 2024 10:50:59 +0100 Subject: [PATCH 01/59] Fix crawling of favorites --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/ilias_web_crawler.py | 9 +++------ PFERD/crawl/ilias/kit_ilias_html.py | 9 +++++++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 573cad9..ce20269 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Fixed +- Personal desktop/dashboard/favorites crawling + ## 3.6.0 - 2024-10-23 ### Added diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index b77f4fc..a566ce5 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -185,12 +185,9 @@ instance's greatest bottleneck. await self._crawl_url(root_url, expected_id=course_id) async def _crawl_desktop(self) -> None: - appendix = r"ILIAS\Repository\Provider\RepositoryMainBarProvider|mm_pd_sel_items" - appendix = appendix.encode("ASCII").hex() - await self._crawl_url(url_set_query_param( - urljoin(self._base_url, "/gs_content.php"), - "item=", appendix, - )) + await self._crawl_url( + urljoin(self._base_url, "/ilias.php?baseClass=ilDashboardGUI&cmd=show") + ) async def _crawl_url(self, url: str, expected_id: Optional[int] = None) -> None: maybe_cl = await self.crawl(PurePath(".")) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 34e02ba..98b32c3 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -322,7 +322,7 @@ class IliasPage: return False def _is_personal_desktop(self) -> bool: - return self._soup.find("a", attrs={"href": lambda x: x and "block_type=pditems" in x}) + return "baseclass=ildashboardgui" in self._page_url.lower() and "&cmd=show" in self._page_url.lower() def _is_content_page(self) -> bool: if link := self.get_permalink(): @@ -427,9 +427,14 @@ class IliasPage: def _find_personal_desktop_entries(self) -> List[IliasPageElement]: items: List[IliasPageElement] = [] - titles: List[Tag] = self._soup.select(".il-item-title") + titles: List[Tag] = self._soup.select("#block_pditems_0 .il-item-title") for title in titles: link = title.find("a") + + if not link: + log.explain(f"Skipping offline item: {title.getText().strip()!r}") + continue + name = _sanitize_path_name(link.text.strip()) url = self._abs_url_from_link(link) From 739dd958500349dfc54f6a8370a10b122b1e1bee Mon Sep 17 00:00:00 2001 From: Tim Date: Sun, 27 Oct 2024 19:03:47 +0100 Subject: [PATCH 02/59] Use Last-Modified and ETag headers to determine KIT-IPD file versions (#95) Co-authored-by: I-Al-Istannen --- PFERD/crawl/crawler.py | 11 ++++++- PFERD/crawl/http_crawler.py | 52 +++++++++++++++++++++++++++++++++- PFERD/crawl/kit_ipd_crawler.py | 36 +++++++++++++++++++---- PFERD/output_dir.py | 15 ++++++++-- 4 files changed, 104 insertions(+), 10 deletions(-) diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 0e67c02..dd500e6 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -293,6 +293,8 @@ class Crawler(ABC): async def download( self, path: PurePath, + *, + etag_differs: Optional[bool] = None, mtime: Optional[datetime] = None, redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, @@ -307,7 +309,14 @@ class Crawler(ABC): log.status("[bold bright_black]", "Ignored", fmt_path(path)) return None - fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict) + fs_token = await self._output_dir.download( + path, + transformed_path, + etag_differs=etag_differs, + mtime=mtime, + redownload=redownload, + on_conflict=on_conflict + ) if fs_token is None: log.explain("Answer: No") return None diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py index 44ec4dd..39b22f3 100644 --- a/PFERD/crawl/http_crawler.py +++ b/PFERD/crawl/http_crawler.py @@ -1,8 +1,9 @@ import asyncio import http.cookies import ssl +from datetime import datetime from pathlib import Path, PurePath -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple import aiohttp import certifi @@ -15,6 +16,8 @@ from ..utils import fmt_real_path from ..version import NAME, VERSION from .crawler import Crawler, CrawlerSection +ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags" + class HttpCrawlerSection(CrawlerSection): def http_timeout(self) -> float: @@ -169,6 +172,53 @@ class HttpCrawler(Crawler): log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") log.warn(str(e)) + def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]: + """ + If available, retrieves the entity tag for a given path which was stored in the previous report. + """ + if not self._output_dir.prev_report: + return None + + etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} + return etags.get(str(path)) + + def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None: + """ + Adds an entity tag for a given path to the report's custom values. + """ + if not etag: + return + + etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} + etags[str(path)] = etag + self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags) + + async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]: + """ + Requests the ETag and Last-Modified headers of a resource via a HEAD request. + If no entity tag / modification date can be obtained, the according value will be None. + """ + try: + async with self.session.head(resource_url) as resp: + if resp.status != 200: + return None, None + + etag_header = resp.headers.get("ETag") + last_modified_header = resp.headers.get("Last-Modified") + + if last_modified_header: + try: + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives + datetime_format = "%a, %d %b %Y %H:%M:%S GMT" + last_modified = datetime.strptime(last_modified_header, datetime_format) + except ValueError: + # last_modified remains None + pass + + return etag_header, last_modified + except aiohttp.ClientError: + return None, None + async def run(self) -> None: self._request_count = 0 self._cookie_jar = aiohttp.CookieJar() diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index c852be0..d9515e2 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -1,6 +1,7 @@ import os import re from dataclasses import dataclass +from datetime import datetime from pathlib import PurePath from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union from urllib.parse import urljoin @@ -75,8 +76,11 @@ class KitIpdCrawler(HttpCrawler): if isinstance(item, KitIpdFolder): tasks.append(self._crawl_folder(item)) else: + # do this here to at least be sequential and not parallel (rate limiting is hard, as the + # crawl abstraction does not hold for these requests) + etag, mtime = await self._request_resource_version(item.url) # Orphan files are placed in the root folder - tasks.append(self._download_file(PurePath("."), item)) + tasks.append(self._download_file(PurePath("."), item, etag, mtime)) await self.gather(tasks) @@ -85,18 +89,36 @@ class KitIpdCrawler(HttpCrawler): if not await self.crawl(path): return - tasks = [self._download_file(path, file) for file in folder.files] + tasks = [] + for file in folder.files: + # do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl + # abstraction does not hold for these requests) + etag, mtime = await self._request_resource_version(file.url) + tasks.append(self._download_file(path, file, etag, mtime)) await self.gather(tasks) - async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None: + async def _download_file( + self, + parent: PurePath, + file: KitIpdFile, + etag: Optional[str], + mtime: Optional[datetime] + ) -> None: element_path = parent / file.name - maybe_dl = await self.download(element_path) + + prev_etag = self._get_previous_etag_from_report(element_path) + etag_differs = None if prev_etag is None else prev_etag != etag + + maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime) if not maybe_dl: + # keep storing the known file's etag + if prev_etag: + self._add_etag_to_report(element_path, prev_etag) return async with maybe_dl as (bar, sink): - await self._stream_from_url(file.url, sink, bar) + await self._stream_from_url(file.url, element_path, sink, bar) async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: page, url = await self.get_page() @@ -146,7 +168,7 @@ class KitIpdCrawler(HttpCrawler): def _abs_url_from_link(self, url: str, link_tag: Tag) -> str: return urljoin(url, link_tag.get("href")) - async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: + async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None: async with self.session.get(url, allow_redirects=False) as resp: if resp.status == 403: raise CrawlError("Received a 403. Are you within the KIT network/VPN?") @@ -159,6 +181,8 @@ class KitIpdCrawler(HttpCrawler): sink.done() + self._add_etag_to_report(path, resp.headers.get("ETag")) + async def get_page(self) -> Tuple[BeautifulSoup, str]: async with self.session.get(self._url) as request: # The web page for Algorithmen für Routenplanung contains some diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index e9e9b93..09cf133 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -57,6 +57,7 @@ class OnConflict(Enum): @dataclass class Heuristics: + etag_differs: Optional[bool] mtime: Optional[datetime] @@ -233,8 +234,16 @@ class OutputDirectory: remote_newer = None + # ETag should be a more reliable indicator than mtime, so we check it first + if heuristics.etag_differs is not None: + remote_newer = heuristics.etag_differs + if remote_newer: + log.explain("Remote file's entity tag differs") + else: + log.explain("Remote file's entity tag is the same") + # Python on Windows crashes when faced with timestamps around the unix epoch - if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): + if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): mtime = heuristics.mtime remote_newer = mtime.timestamp() > stat.st_mtime if remote_newer: @@ -366,6 +375,8 @@ class OutputDirectory: self, remote_path: PurePath, path: PurePath, + *, + etag_differs: Optional[bool] = None, mtime: Optional[datetime] = None, redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, @@ -375,7 +386,7 @@ class OutputDirectory: MarkConflictError. """ - heuristics = Heuristics(mtime) + heuristics = Heuristics(etag_differs, mtime) redownload = self._redownload if redownload is None else redownload on_conflict = self._on_conflict if on_conflict is None else on_conflict local_path = self.resolve(path) From 8fbd1978affb059f79bab374030afa139b341a6c Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 28 Oct 2024 18:52:09 +0100 Subject: [PATCH 03/59] Fix crawling of nested courses --- CHANGELOG.md | 1 + PFERD/crawl/ilias/ilias_web_crawler.py | 7 ++++--- PFERD/crawl/ilias/kit_ilias_html.py | 19 +++++++++++-------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce20269..3ee3f43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ ambiguous situations. ### Fixed - Personal desktop/dashboard/favorites crawling +- Crawling of nested courses ## 3.6.0 - 2024-10-23 diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index a566ce5..1ff4910 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -81,23 +81,24 @@ class IliasWebCrawlerSection(HttpCrawlerSection): _DIRECTORY_PAGES: Set[IliasElementType] = { + IliasElementType.COURSE, IliasElementType.EXERCISE, IliasElementType.EXERCISE_FILES, IliasElementType.FOLDER, IliasElementType.INFO_TAB, - IliasElementType.MEETING, IliasElementType.MEDIACAST_VIDEO_FOLDER, + IliasElementType.MEETING, IliasElementType.OPENCAST_VIDEO_FOLDER, IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, } _VIDEO_ELEMENTS: Set[IliasElementType] = { - IliasElementType.MEDIACAST_VIDEO_FOLDER, IliasElementType.MEDIACAST_VIDEO, + IliasElementType.MEDIACAST_VIDEO_FOLDER, IliasElementType.OPENCAST_VIDEO, - IliasElementType.OPENCAST_VIDEO_PLAYER, IliasElementType.OPENCAST_VIDEO_FOLDER, IliasElementType.OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED, + IliasElementType.OPENCAST_VIDEO_PLAYER, } diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 98b32c3..31107cf 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -15,25 +15,26 @@ TargetType = Union[str, int] class IliasElementType(Enum): + BOOKING = "booking" + COURSE = "course" EXERCISE = "exercise" EXERCISE_FILES = "exercise_files" # own submitted files - TEST = "test" # an online test. Will be ignored currently. FILE = "file" FOLDER = "folder" FORUM = "forum" - LINK = "link" INFO_TAB = "info_tab" LEARNING_MODULE = "learning_module" - BOOKING = "booking" - MEETING = "meeting" - SURVEY = "survey" - SCORM_LEARNING_MODULE = "scorm_learning_module" - MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" + LINK = "link" MEDIACAST_VIDEO = "mediacast_video" + MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" + MEETING = "meeting" OPENCAST_VIDEO = "opencast_video" - OPENCAST_VIDEO_PLAYER = "opencast_video_player" OPENCAST_VIDEO_FOLDER = "opencast_video_folder" OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated" + OPENCAST_VIDEO_PLAYER = "opencast_video_player" + SCORM_LEARNING_MODULE = "scorm_learning_module" + SURVEY = "survey" + TEST = "test" # an online test. Will be ignored currently. @dataclass @@ -968,6 +969,8 @@ class IliasPage: return IliasElementType.LINK if "book" in icon["class"]: return IliasElementType.BOOKING + if "crsr" in icon["class"]: + return IliasElementType.COURSE if "frm" in icon["class"]: return IliasElementType.FORUM if "sess" in icon["class"]: From c1046498e7ff6ab054c65db4a133af6e53e93f03 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 28 Oct 2024 19:15:40 +0100 Subject: [PATCH 04/59] Fix download of links without a target URL They are now downloaded as links to the empty url. --- CHANGELOG.md | 1 + PFERD/crawl/ilias/ilias_web_crawler.py | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ee3f43..8bc6f06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ ambiguous situations. ### Fixed - Personal desktop/dashboard/favorites crawling - Crawling of nested courses +- Downloading of links with no target URL ## 3.6.0 - 2024-10-23 diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 1ff4910..8fbd90f 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -491,17 +491,26 @@ instance's greatest bottleneck. self._write_link_content(link_template, element.url, element.name, element.description, sink) async def _resolve_link_target(self, export_url: str) -> str: - async with self.session.get(export_url, allow_redirects=False) as resp: - # No redirect means we were authenticated - if hdrs.LOCATION not in resp.headers: - return soupify(await resp.read()).select_one("a").get("href").strip() + async def impl() -> Optional[str]: + async with self.session.get(export_url, allow_redirects=False) as resp: + # No redirect means we were authenticated + if hdrs.LOCATION not in resp.headers: + return soupify(await resp.read()).select_one("a").get("href").strip() + # We are either unauthenticated or the link is not active + new_url = resp.headers[hdrs.LOCATION].lower() + if "baseclass=illinkresourcehandlergui" in new_url and "cmd=infoscreen" in new_url: + return "" + return None + + target = await impl() + if target is not None: + return target await self._authenticate() - async with self.session.get(export_url, allow_redirects=False) as resp: - # No redirect means we were authenticated - if hdrs.LOCATION not in resp.headers: - return soupify(await resp.read()).select_one("a").get("href").strip() + target = await impl() + if target is not None: + return target raise CrawlError("resolve_link_target failed even after authenticating") From 71c65e89d178cde2e2a625d078eba713139a3601 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 28 Oct 2024 19:31:50 +0100 Subject: [PATCH 05/59] Internalize images in course descriptions --- PFERD/crawl/ilias/ilias_web_crawler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 8fbd90f..08add07 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -476,6 +476,7 @@ instance's greatest bottleneck. async with dl as (bar, sink): description = clean(insert_base_markup(description)) + description = await self.internalize_images(description) sink.file.write(description.prettify().encode("utf-8")) sink.done() From d7a2b6e019a994a9e18e00cffe14da2db763e025 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 28 Oct 2024 19:32:16 +0100 Subject: [PATCH 06/59] Delete videos from course descriptions --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/ilias_html_cleaner.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bc6f06..f635719 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Changed +- Remove videos from description pages + ### Fixed - Personal desktop/dashboard/favorites crawling - Crawling of nested courses diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index 5495304..0075784 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -85,6 +85,11 @@ def clean(soup: BeautifulSoup) -> BeautifulSoup: if isinstance(type(children[0]), Comment): dummy.decompose() + # Delete video figures, as they can not be internalized anyway + for video in soup.select(".ilc_media_cont_MediaContainerHighlighted .ilPageVideo"): + if figure := video.find_parent("figure"): + figure.decompose() + for hrule_imposter in soup.find_all(class_="ilc_section_Separator"): hrule_imposter.insert(0, soup.new_tag("hr")) From 81d6ff53c43f0ed7cc49f66c5505f36c0bf0f1b3 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 28 Oct 2024 19:34:45 +0100 Subject: [PATCH 07/59] Respect row flex in descriptions --- CHANGELOG.md | 1 + PFERD/crawl/ilias/ilias_html_cleaner.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f635719..e14f785 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ ambiguous situations. - Personal desktop/dashboard/favorites crawling - Crawling of nested courses - Downloading of links with no target URL +- Handle row flex on description pages ## 3.6.0 - 2024-10-23 diff --git a/PFERD/crawl/ilias/ilias_html_cleaner.py b/PFERD/crawl/ilias/ilias_html_cleaner.py index 0075784..e82906f 100644 --- a/PFERD/crawl/ilias/ilias_html_cleaner.py +++ b/PFERD/crawl/ilias/ilias_html_cleaner.py @@ -12,6 +12,13 @@ _STYLE_TAG_CONTENT = """ font-weight: bold; } + .row-flex { + display: flex; + } + .row-flex-wrap { + flex-wrap: wrap; + } + .accordion-head { background-color: #f5f7fa; padding: 0.5rem 0; From fa71a9f44fe11a367a396b0cd80b745fe7ef6fe8 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Mon, 28 Oct 2024 20:15:55 +0100 Subject: [PATCH 08/59] Add support for mob videos in page descriptions --- CHANGELOG.md | 3 +++ PFERD/crawl/ilias/ilias_web_crawler.py | 16 ++++++++++--- PFERD/crawl/ilias/kit_ilias_html.py | 33 ++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e14f785..d9431bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ ambiguous situations. ## Unreleased +### Added +- Support for MOB videos in page descriptions + ### Changed - Remove videos from description pages diff --git a/PFERD/crawl/ilias/ilias_web_crawler.py b/PFERD/crawl/ilias/ilias_web_crawler.py index 08add07..73fed9c 100644 --- a/PFERD/crawl/ilias/ilias_web_crawler.py +++ b/PFERD/crawl/ilias/ilias_web_crawler.py @@ -389,6 +389,8 @@ instance's greatest bottleneck. return await self._handle_opencast_video(element, element_path) elif element.type == IliasElementType.MEDIACAST_VIDEO: return await self._handle_file(element, element_path) + elif element.type == IliasElementType.MOB_VIDEO: + return await self._handle_file(element, element_path, is_video=True) elif element.type in _DIRECTORY_PAGES: return await self._handle_ilias_page(element.url, element, element_path) else: @@ -631,18 +633,19 @@ instance's greatest bottleneck. self, element: IliasPageElement, element_path: PurePath, + is_video: bool = False, ) -> Optional[Coroutine[Any, Any, None]]: maybe_dl = await self.download(element_path, mtime=element.mtime) if not maybe_dl: return None - return self._download_file(element, maybe_dl) + return self._download_file(element, maybe_dl, is_video) @_iorepeat(3, "downloading file") @anoncritical - async def _download_file(self, element: IliasPageElement, dl: DownloadToken) -> None: + async def _download_file(self, element: IliasPageElement, dl: DownloadToken, is_video: bool) -> None: assert dl # The function is only reached when dl is not None async with dl as (bar, sink): - await self._stream_from_url(element.url, sink, bar, is_video=False) + await self._stream_from_url(element.url, sink, bar, is_video) async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar, is_video: bool) -> None: async def try_stream() -> bool: @@ -671,6 +674,13 @@ instance's greatest bottleneck. if is_video and "html" in resp.content_type: return False + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Range + if content_range := resp.headers.get(hdrs.CONTENT_RANGE, default=None): + parts = content_range.split("/") + if len(parts) == 2 and parts[1].isdigit(): + bar.set_total(int(parts[1])) + + # Prefer the content length header if resp.content_length: bar.set_total(resp.content_length) diff --git a/PFERD/crawl/ilias/kit_ilias_html.py b/PFERD/crawl/ilias/kit_ilias_html.py index 31107cf..e0c87ad 100644 --- a/PFERD/crawl/ilias/kit_ilias_html.py +++ b/PFERD/crawl/ilias/kit_ilias_html.py @@ -28,6 +28,7 @@ class IliasElementType(Enum): MEDIACAST_VIDEO = "mediacast_video" MEDIACAST_VIDEO_FOLDER = "mediacast_video_folder" MEETING = "meeting" + MOB_VIDEO = "mob_video" OPENCAST_VIDEO = "opencast_video" OPENCAST_VIDEO_FOLDER = "opencast_video_folder" OPENCAST_VIDEO_FOLDER_MAYBE_PAGINATED = "opencast_video_folder_maybe_paginated" @@ -745,6 +746,7 @@ class IliasPage: result += self._find_cards() result += self._find_mediacast_videos() + result += self._find_mob_videos() return result @@ -773,6 +775,37 @@ class IliasPage: return videos + def _find_mob_videos(self) -> List[IliasPageElement]: + videos: List[IliasPageElement] = [] + + for figure in self._soup.select("figure.ilc_media_cont_MediaContainerHighlighted"): + title = figure.select_one("figcaption").getText().strip() + ".mp4" + video_element = figure.select_one("video") + if not video_element: + _unexpected_html_warning() + log.warn_contd(f"No