update sync_url.py and fix event_loop housekeeping

This commit is contained in:
be7a 2021-04-28 23:58:21 +02:00
parent 2d6be9f5c1
commit 3eab236b99
No known key found for this signature in database
GPG key ID: 6510870A77F49A99
3 changed files with 83 additions and 18 deletions

View file

@@ -136,6 +136,10 @@ class IliasCrawler:
self._authenticator = authenticator self._authenticator = authenticator
self.dir_filter = dir_filter self.dir_filter = dir_filter
# Setup authentication locks
self.auth_event = asyncio.Event()
self.auth_lock = asyncio.Lock()
@staticmethod @staticmethod
def _url_set_query_param(url: str, param: str, value: str) -> str: def _url_set_query_param(url: str, param: str, value: str) -> str:
""" """
@@ -188,7 +192,7 @@ class IliasCrawler:
""" """
Returns the name of the element at the given URL, if it can find one. Returns the name of the element at the given URL, if it can find one.
""" """
focus_element: bs4.Tag = await self._get_page(url, {}).find( focus_element: bs4.Tag = (await self._get_page(url, {})).find(
id="il_mhead_t_focus" id="il_mhead_t_focus"
) )
if not focus_element: if not focus_element:
@@ -274,10 +278,6 @@ class IliasCrawler:
): ):
crawl_queue = asyncio.Queue() crawl_queue = asyncio.Queue()
# Setup authentication locks
self._auth_event = asyncio.Event()
self._auth_lock = asyncio.Lock()
for entry in entries: for entry in entries:
crawl_queue.put_nowait(entry) crawl_queue.put_nowait(entry)
@@ -784,16 +784,16 @@ class IliasCrawler:
if self._is_logged_in(soup): if self._is_logged_in(soup):
return soup return soup
if self._auth_lock.locked(): if self.auth_lock.locked():
# Some other future is already logging in # Some other future is already logging in
await self._auth_event.wait() await self._auth_event.wait()
else: else:
await self._auth_lock.acquire() await self.auth_lock.acquire()
self._auth_event.clear() self.auth_event.clear()
LOGGER.info("Not authenticated, changing that...") LOGGER.info("Not authenticated, changing that...")
await self._authenticator.authenticate(self._client) await self._authenticator.authenticate(self._client)
self._auth_event.set() self.auth_event.set()
self._auth_lock.release() self.auth_lock.release()
return await self._get_page( return await self._get_page(
url, url,

View file

@@ -94,6 +94,11 @@ class Pferd(Location):
self._test_run = test_run self._test_run = test_run
self._ilias_targets: List[IliasTarget] = [] self._ilias_targets: List[IliasTarget] = []
# Initialize the event loop eagerly.
# This is needed because IliasCrawler sets up synchronization primitives
# which are tied to the event loop, so the loop must not change afterwards.
self._loop = asyncio.get_event_loop()
@staticmethod @staticmethod
def enable_logging() -> None: def enable_logging() -> None:
""" """
@@ -188,7 +193,7 @@ class Pferd(Location):
) )
self._ilias_targets.append(target) self._ilias_targets.append(target)
def add_ilias_folder( def add_ilias_course(
self, self,
ilias: IliasSycronizer, ilias: IliasSycronizer,
target: PathLike, target: PathLike,
@@ -234,6 +239,53 @@ class Pferd(Location):
) )
self._ilias_targets.append(target) self._ilias_targets.append(target)
def add_ilias_folder(
        self,
        ilias: IliasSycronizer,
        target: PathLike,
        full_url: str,
        transform: Transform = lambda x: x,
        download_strategy: IliasDownloadStrategy = download_modified_or_new,
        clean: bool = True,
        timeout: int = 5,
        file_conflict_resolver: FileConflictResolver = resolve_prompt_user
) -> Organizer:
    """
    Synchronizes a folder with a given folder on the given ILIAS instance.

    Arguments:
        ilias {IliasSycronizer} -- the ILIAS Instance
        target {Path} -- the target path to write the data to
        full_url {str} -- the full url of the folder/videos/course to crawl

    Keyword Arguments:
        transform {Transform} -- A transformation function for the output paths. Return None
            to ignore a file. (default: {lambda x: x})
        download_strategy {DownloadStrategy} -- A function to determine which files need to
            be downloaded. Can save bandwidth and reduce the number of requests.
            (default: {download_modified_or_new})
        clean {bool} -- Whether to clean up when the method finishes.
        timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
            requests bug.
        file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
            with overwriting or deleting files. The default always asks the user.

    NOTE(review): the signature is annotated ``-> Organizer`` but no return
    statement is visible below, so callers currently receive ``None`` —
    confirm the intended return value.
    """
    # Announce the sync to the user before any crawling starts.
    PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url")
    # Register a crawl job with the synchronizer; the crawler resolves the
    # given URL recursively.
    results = ilias.add_target(
        lambda crawler: crawler.recursive_crawl_url(full_url),
    )
    # Bundle everything into a target record. Note: this rebinds the `target`
    # parameter (the output path) to the IliasTarget container.
    target = IliasTarget(
        results,
        target,
        transform,
        download_strategy,
        clean,
        timeout,
        file_conflict_resolver,
    )
    self._ilias_targets.append(target)
async def _syncronize_ilias(self, ilias: IliasSycronizer): async def _syncronize_ilias(self, ilias: IliasSycronizer):
await ilias.syncronize() await ilias.syncronize()
@@ -279,7 +331,7 @@ class Pferd(Location):
Arguments: Arguments:
ilias {IliasSycronizer} -- the ILIAS Instance ilias {IliasSycronizer} -- the ILIAS Instance
""" """
asyncio.run(self._syncronize_ilias(ilias)) self._loop.run_until_complete(self._syncronize_ilias(ilias))
def print_summary(self) -> None: def print_summary(self) -> None:
""" """

View file

@@ -4,6 +4,7 @@
A simple script to download a course by name from ILIAS. A simple script to download a course by name from ILIAS.
""" """
import asyncio
import argparse import argparse
import logging import logging
import sys import sys
@@ -86,7 +87,11 @@ def main() -> None:
args = parser.parse_args() args = parser.parse_args()
cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None) cookie_jar = CookieJar(to_path(args.cookies) if args.cookies else None)
client = cookie_jar.create_client() client = cookie_jar.create_async_client()
if not args.url.startswith("https://ilias.studium.kit.edu"):
_PRETTY.error("Not a valid KIT ILIAS URL")
return
if args.keyring: if args.keyring:
if not args.username: if not args.username:
@@ -103,13 +108,14 @@ def main() -> None:
url = urlparse(args.url) url = urlparse(args.url)
loop = asyncio.get_event_loop()
crawler = IliasCrawler(url.scheme + '://' + url.netloc, client, crawler = IliasCrawler(url.scheme + '://' + url.netloc, client,
authenticator, lambda x, y: True) authenticator, lambda x, y: True)
cookie_jar.load_cookies() cookie_jar.load_cookies()
if args.folder is None: if args.folder is None:
element_name = crawler.find_element_name(args.url) element_name = loop.run_until_complete(crawler.find_element_name(args.url))
if not element_name: if not element_name:
print("Error, could not get element name. Please specify a folder yourself.") print("Error, could not get element name. Please specify a folder yourself.")
return return
@@ -142,17 +148,24 @@ def main() -> None:
pferd.enable_logging() pferd.enable_logging()
# fetch # fetch
pferd.ilias_kit_folder(
target=target, ilias = pferd.ilias_kit(
full_url=args.url,
cookies=args.cookies,
dir_filter=dir_filter, dir_filter=dir_filter,
cookies=args.cookies,
username=username, username=username,
password=password, password=password,
)
pferd.add_ilias_folder(
ilias=ilias,
target=target,
full_url=args.url,
file_conflict_resolver=file_confilict_resolver, file_conflict_resolver=file_confilict_resolver,
transform=sanitize_windows_path transform=sanitize_windows_path
) )
pferd.syncronize_ilias(ilias)
pferd.print_summary() pferd.print_summary()