From 3eab236b99872af3817486ca21c24f5cbd3073d3 Mon Sep 17 00:00:00 2001 From: be7a Date: Wed, 28 Apr 2021 23:58:21 +0200 Subject: [PATCH] update sync_url.py and fix event_loop housekeeping --- PFERD/ilias/crawler.py | 20 +++++++-------- PFERD/pferd.py | 56 ++++++++++++++++++++++++++++++++++++++++-- sync_url.py | 25 ++++++++++++++----- 3 files changed, 83 insertions(+), 18 deletions(-) diff --git a/PFERD/ilias/crawler.py b/PFERD/ilias/crawler.py index 66e876d..1f89e07 100644 --- a/PFERD/ilias/crawler.py +++ b/PFERD/ilias/crawler.py @@ -136,6 +136,10 @@ class IliasCrawler: self._authenticator = authenticator self.dir_filter = dir_filter + # Setup authentication locks + self.auth_event = asyncio.Event() + self.auth_lock = asyncio.Lock() + @staticmethod def _url_set_query_param(url: str, param: str, value: str) -> str: """ @@ -188,7 +192,7 @@ class IliasCrawler: """ Returns the name of the element at the given URL, if it can find one. """ - focus_element: bs4.Tag = await self._get_page(url, {}).find( + focus_element: bs4.Tag = (await self._get_page(url, {})).find( id="il_mhead_t_focus" ) if not focus_element: @@ -274,10 +278,6 @@ class IliasCrawler: ): crawl_queue = asyncio.Queue() - # Setup authentication locks - self._auth_event = asyncio.Event() - self._auth_lock = asyncio.Lock() - for entry in entries: crawl_queue.put_nowait(entry) @@ -784,16 +784,16 @@ class IliasCrawler: if self._is_logged_in(soup): return soup - if self._auth_lock.locked(): + if self.auth_lock.locked(): # Some other future is already logging in await self._auth_event.wait() else: - await self._auth_lock.acquire() - self._auth_event.clear() + await self.auth_lock.acquire() + self.auth_event.clear() LOGGER.info("Not authenticated, changing that...") await self._authenticator.authenticate(self._client) - self._auth_event.set() - self._auth_lock.release() + self.auth_event.set() + self.auth_lock.release() return await self._get_page( url, diff --git a/PFERD/pferd.py b/PFERD/pferd.py index 
a82f069..94ab93b 100644 --- a/PFERD/pferd.py +++ b/PFERD/pferd.py @@ -94,6 +94,11 @@ class Pferd(Location): self._test_run = test_run self._ilias_targets: List[IliasTarget] = [] + # Initialize the event loop + # This is needed because IliasCrawler sets up synchronization primitives + # which are tied to the event loop, so it shouldn't change + self._loop = asyncio.get_event_loop() + @staticmethod def enable_logging() -> None: """ @@ -188,7 +193,7 @@ class Pferd(Location): ) self._ilias_targets.append(target) - def add_ilias_folder( + def add_ilias_course( self, ilias: IliasSycronizer, target: PathLike, @@ -234,6 +239,53 @@ class Pferd(Location): ) self._ilias_targets.append(target) + + def add_ilias_folder( + self, + ilias: IliasSycronizer, + target: PathLike, + full_url: str, + transform: Transform = lambda x: x, + download_strategy: IliasDownloadStrategy = download_modified_or_new, + clean: bool = True, + timeout: int = 5, + file_conflict_resolver: FileConflictResolver = resolve_prompt_user + ) -> Organizer: + """ + Synchronizes a folder with a given folder on the given ILIAS instance. + Arguments: + ilias {IliasSycronizer} -- the ILIAS Instance + target {Path} -- the target path to write the data to + full_url {str} -- the full url of the folder/videos/course to crawl + Keyword Arguments: + transform {Transform} -- A transformation function for the output paths. Return None + to ignore a file. (default: {lambdax:x}) + download_strategy {DownloadStrategy} -- A function to determine which files need to + be downloaded. Can save bandwidth and reduce the number of requests. + (default: {download_modified_or_new}) + clean {bool} -- Whether to clean up when the method finishes. + timeout {int} -- The download timeout for opencast videos. Sadly needed due to a + requests bug. + file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal + with overwriting or deleting files. The default always asks the user. 
+ """ + PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url") + + + results = ilias.add_target( + lambda crawler: crawler.recursive_crawl_url(full_url), + ) + target = IliasTarget( + results, + target, + transform, + download_strategy, + clean, + timeout, + file_conflict_resolver, + ) + self._ilias_targets.append(target) + async def _syncronize_ilias(self, ilias: IliasSycronizer): await ilias.syncronize() @@ -279,7 +331,7 @@ class Pferd(Location): Arguments: ilias {IliasSycronizer} -- the ILIAS Instance """ - asyncio.run(self._syncronize_ilias(ilias)) + self._loop.run_until_complete(self._syncronize_ilias(ilias)) def print_summary(self) -> None: """ diff --git a/sync_url.py b/sync_url.py index 2ccbc95..9252776 100755 --- a/sync_url.py +++ b/sync_url.py @@ -4,6 +4,7 @@ A simple script to download a course by name from ILIAS. """ +import asyncio import argparse import logging import sys @@ -86,7 +87,11 @@ def main() -> None: args = parser.parse_args() cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None) - client = cookie_jar.create_client() + client = cookie_jar.create_async_client() + + if not args.url.startswith("https://ilias.studium.kit.edu"): + _PRETTY.error("Not a valid KIT ILIAS URL") + return if args.keyring: if not args.username: @@ -103,13 +108,14 @@ def main() -> None: url = urlparse(args.url) + loop = asyncio.get_event_loop() crawler = IliasCrawler(url.scheme + '://' + url.netloc, client, authenticator, lambda x, y: True) cookie_jar.load_cookies() if args.folder is None: - element_name = crawler.find_element_name(args.url) + element_name = loop.run_until_complete(crawler.find_element_name(args.url)) if not element_name: print("Error, could not get element name. 
Please specify a folder yourself.") return @@ -142,17 +148,24 @@ def main() -> None: pferd.enable_logging() # fetch - pferd.ilias_kit_folder( - target=target, - full_url=args.url, - cookies=args.cookies, + + ilias = pferd.ilias_kit( dir_filter=dir_filter, + cookies=args.cookies, username=username, password=password, + ) + + pferd.add_ilias_folder( + ilias=ilias, + target=target, + full_url=args.url, file_conflict_resolver=file_confilict_resolver, transform=sanitize_windows_path ) + pferd.syncronize_ilias(ilias) + pferd.print_summary()