From 10d9d7452809aafe4f406f894944a078072f16bf Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sat, 8 Jan 2022 20:28:30 +0100 Subject: [PATCH] Bail out when crawling recursive courses --- PFERD/crawl/ilias/kit_ilias_web_crawler.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py index c4e70c0..8f78e7a 100644 --- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py +++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py @@ -182,6 +182,7 @@ instance's greatest bottleneck. self._link_file_redirect_delay = section.link_redirect_delay() self._links = section.links() self._videos = section.videos() + self._visited_urls: Set[str] = set() async def _run(self) -> None: if isinstance(self._target, int): @@ -309,6 +310,12 @@ instance's greatest bottleneck. parent_path: PurePath, element: IliasPageElement, ) -> Optional[Awaitable[None]]: + if element.url in self._visited_urls: + raise CrawlWarning( + f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" + ) + self._visited_urls.add(element.url) + element_path = PurePath(parent_path, element.name) if element.type in _VIDEO_ELEMENTS: