use the async client to crawl ILIAS

This commit is contained in:
be7a 2021-04-23 21:43:52 +02:00
parent 44aeb6c2eb
commit 411d4b91d5
No known key found for this signature in database
GPG key ID: 6510870A77F49A99
5 changed files with 131 additions and 84 deletions

View file

@ -65,3 +65,10 @@ class CookieJar:
client.cookies = self.cookies # type: ignore client.cookies = self.cookies # type: ignore
return client return client
def create_async_client(self) -> httpx.AsyncClient:
    """Create a new httpx.AsyncClient that uses this jar's cookie store.

    The client's cookies attribute is pointed at the jar's own cookies,
    so cookies set through the returned client are shared with the jar
    (mirrors the synchronous create_client above).
    """
    # TODO: timeout=None was the default behaviour of requests. An appropriate value should probably be set
    client = httpx.AsyncClient(timeout=None)
    client.cookies = self.cookies
    return client

View file

@ -24,9 +24,9 @@ class IliasAuthenticator(abc.ABC):
""" """
@abc.abstractmethod @abc.abstractmethod
def authenticate(self, client: httpx.Client) -> None: async def authenticate(self, client: httpx.AsyncClient) -> None:
""" """
Log a httpx client into this authenticator's ILIAS account. Log a httpx AsyncClient into this authenticator's ILIAS account.
""" """
@ -45,7 +45,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth") self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth")
def authenticate(self, sess: httpx.Client) -> None: async def authenticate(self, client: httpx.AsyncClient) -> None:
""" """
Performs the ILIAS Shibboleth authentication dance and saves the login Performs the ILIAS Shibboleth authentication dance and saves the login
cookies it receives. cookies it receives.
@ -65,7 +65,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
"target": "/shib_login.php", "target": "/shib_login.php",
"home_organization_selection": "Mit KIT-Account anmelden", "home_organization_selection": "Mit KIT-Account anmelden",
} }
soup = soupify(sess.post(url, data=data)) soup = soupify(await client.post(url, data=data))
# Attempt to login using credentials, if necessary # Attempt to login using credentials, if necessary
while not self._login_successful(soup): while not self._login_successful(soup):
@ -86,10 +86,10 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
"j_password": self._auth.password, "j_password": self._auth.password,
"csrf_token": csrf_token "csrf_token": csrf_token
} }
soup = soupify(sess.post(url, data=data)) soup = soupify(await client.post(url, data=data))
if self._tfa_required(soup): if self._tfa_required(soup):
soup = self._authenticate_tfa(sess, soup) soup = await self._authenticate_tfa(client, soup)
if not self._login_successful(soup): if not self._login_successful(soup):
print("Incorrect credentials.") print("Incorrect credentials.")
@ -105,11 +105,11 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
"RelayState": relay_state["value"], "RelayState": relay_state["value"],
"SAMLResponse": saml_response["value"], "SAMLResponse": saml_response["value"],
} }
sess.post(url, data=data) await client.post(url, data=data)
def _authenticate_tfa( async def _authenticate_tfa(
self, self,
client: httpx.Client, client: httpx.AsyncClient,
soup: bs4.BeautifulSoup soup: bs4.BeautifulSoup
) -> bs4.BeautifulSoup: ) -> bs4.BeautifulSoup:
# Searching the form here so that this fails before asking for # Searching the form here so that this fails before asking for
@ -125,7 +125,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
"_eventId_proceed": "", "_eventId_proceed": "",
"j_tokenNumber": self._tfa_auth.get_token() "j_tokenNumber": self._tfa_auth.get_token()
} }
return soupify(client.post(url, data=data)) return soupify(await client.post(url, data=data))
@staticmethod @staticmethod
def _login_successful(soup: bs4.BeautifulSoup) -> bool: def _login_successful(soup: bs4.BeautifulSoup) -> bool:

View file

@ -2,18 +2,20 @@
Contains an ILIAS crawler alongside helper functions. Contains an ILIAS crawler alongside helper functions.
""" """
from asyncio.queues import Queue
import datetime import datetime
import json import json
import logging import logging
import re import re
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union from typing import Any, Callable, Awaitable, Dict, List, Optional, Union
from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit, from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
urlunsplit) urlunsplit)
import bs4 import bs4
import httpx import httpx
import asyncio
from ..errors import FatalException, retry_on_io_exception from ..errors import FatalException, retry_on_io_exception
from ..logging import PrettyLogger from ..logging import PrettyLogger
@ -62,14 +64,16 @@ class IliasCrawlerEntry:
def __init__( def __init__(
self, self,
path: Path, path: Path,
url: Union[str, Callable[[], Optional[str]]], url: Union[str, Callable[[], Awaitable[Optional[str]]]],
entry_type: IliasElementType, entry_type: IliasElementType,
modification_date: Optional[datetime.datetime] modification_date: Optional[datetime.datetime]
): ):
self.path = path self.path = path
if isinstance(url, str): if isinstance(url, str):
str_url = url # TODO: Dirty hack, remove
self.url: Callable[[], Optional[str]] = lambda: str_url future = asyncio.Future()
future.set_result(url)
self.url: Callable[[], Awaitable[Optional[str]]] = lambda: future
else: else:
self.url = url self.url = url
self.entry_type = entry_type self.entry_type = entry_type
@ -96,7 +100,7 @@ class IliasCrawler:
def __init__( def __init__(
self, self,
base_url: str, base_url: str,
client: httpx.Client, client: httpx.AsyncClient,
authenticator: IliasAuthenticator, authenticator: IliasAuthenticator,
dir_filter: IliasDirectoryFilter dir_filter: IliasDirectoryFilter
): ):
@ -121,17 +125,17 @@ class IliasCrawler:
return urlunsplit((scheme, netloc, path, new_query_string, fragment)) return urlunsplit((scheme, netloc, path, new_query_string, fragment))
def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]: async def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]:
""" """
Crawls a given url *and all reachable elements in it*. Crawls a given url *and all reachable elements in it*.
Args: Args:
url {str} -- the *full* url to crawl url {str} -- the *full* url to crawl
""" """
start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url) start_entries: List[IliasCrawlerEntry] = await self._crawl_folder(Path(""), url)
return self._iterate_entries_to_download_infos(start_entries) return await self._iterate_entries_to_download_infos(start_entries)
def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]: async def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]:
""" """
Starts the crawl process for a course, yielding a list of elements to (potentially) Starts the crawl process for a course, yielding a list of elements to (potentially)
download. download.
@ -147,28 +151,28 @@ class IliasCrawler:
self._base_url + "/goto.php", "target", f"crs_{course_id}" self._base_url + "/goto.php", "target", f"crs_{course_id}"
) )
if not self._is_course_id_valid(root_url, course_id): if not await self._is_course_id_valid(root_url, course_id):
raise FatalException( raise FatalException(
"Invalid course id? I didn't find anything looking like a course!" "Invalid course id? I didn't find anything looking like a course!"
) )
# And treat it as a folder # And treat it as a folder
entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), root_url) entries: List[IliasCrawlerEntry] = await self._crawl_folder(Path(""), root_url)
return self._iterate_entries_to_download_infos(entries) return await self._iterate_entries_to_download_infos(entries)
def _is_course_id_valid(self, root_url: str, course_id: str) -> bool: async def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
response: httpx.Response = self._client.get(root_url) response: httpx.Response = await self._client.get(root_url)
# We were redirected ==> Non-existent ID # We were redirected ==> Non-existent ID
if course_id not in str(response.url): if course_id not in str(response.url):
return False return False
link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link") link_element: bs4.Tag = (await self._get_page(root_url, {})).find(id="current_perma_link")
if not link_element: if not link_element:
return False return False
# It wasn't a course but a category list, forum, etc. # It wasn't a course but a category list, forum, etc.
return "crs_" in link_element.get("value") return "crs_" in link_element.get("value")
def find_course_name(self, course_id: str) -> Optional[str]: async def find_course_name(self, course_id: str) -> Optional[str]:
""" """
Returns the name of a given course. None if it is not a valid course Returns the name of a given course. None if it is not a valid course
or it could not be found. or it could not be found.
@ -176,81 +180,111 @@ class IliasCrawler:
course_url = self._url_set_query_param( course_url = self._url_set_query_param(
self._base_url + "/goto.php", "target", f"crs_{course_id}" self._base_url + "/goto.php", "target", f"crs_{course_id}"
) )
return self.find_element_name(course_url) return await self.find_element_name(course_url)
def find_element_name(self, url: str) -> Optional[str]: async def find_element_name(self, url: str) -> Optional[str]:
""" """
Returns the name of the element at the given URL, if it can find one. Returns the name of the element at the given URL, if it can find one.
""" """
focus_element: bs4.Tag = self._get_page(url, {}).find(id="il_mhead_t_focus") focus_element: bs4.Tag = await self._get_page(url, {}).find(id="il_mhead_t_focus")
if not focus_element: if not focus_element:
return None return None
return focus_element.text return focus_element.text
def crawl_personal_desktop(self) -> List[IliasDownloadInfo]: async def crawl_personal_desktop(self) -> List[IliasDownloadInfo]:
""" """
Crawls the ILIAS personal desktop (and every subelements that can be reached from there). Crawls the ILIAS personal desktop (and every subelements that can be reached from there).
Raises: Raises:
FatalException: if an unrecoverable error occurs FatalException: if an unrecoverable error occurs
""" """
entries: List[IliasCrawlerEntry] = self._crawl_folder( entries: List[IliasCrawlerEntry] = await self._crawl_folder(
Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI" Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI"
) )
return self._iterate_entries_to_download_infos(entries) return await self._iterate_entries_to_download_infos(entries)
def _iterate_entries_to_download_infos( async def _crawl_worker(self, entries_to_process: asyncio.Queue, result: List[IliasDownloadInfo]):
self, while True:
entries: List[IliasCrawlerEntry] entry = await entries_to_process.get()
) -> List[IliasDownloadInfo]:
result: List[IliasDownloadInfo] = []
entries_to_process: List[IliasCrawlerEntry] = entries.copy()
while len(entries_to_process) > 0:
entry = entries_to_process.pop()
if entry.entry_type == IliasElementType.EXTERNAL_LINK: if entry.entry_type == IliasElementType.EXTERNAL_LINK:
PRETTY.not_searching(entry.path, "external link") PRETTY.not_searching(entry.path, "external link")
entries_to_process.task_done()
continue continue
if entry.entry_type == IliasElementType.FORUM: if entry.entry_type == IliasElementType.FORUM:
PRETTY.not_searching(entry.path, "forum") PRETTY.not_searching(entry.path, "forum")
entries_to_process.task_done()
continue continue
if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type): if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type):
PRETTY.not_searching(entry.path, "user filter") PRETTY.not_searching(entry.path, "user filter")
entries_to_process.task_done()
continue continue
download_info = entry.to_download_info() download_info = entry.to_download_info()
if download_info is not None: if download_info is not None:
result.append(download_info) result.append(download_info)
entries_to_process.task_done()
continue continue
url = entry.url() url = await entry.url()
if url is None: if url is None:
PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it") PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it")
entries_to_process.task_done()
continue continue
PRETTY.searching(entry.path) PRETTY.searching(entry.path)
if entry.entry_type == IliasElementType.EXERCISE_FOLDER: if entry.entry_type == IliasElementType.EXERCISE_FOLDER:
entries_to_process += self._crawl_exercises(entry.path, url) for task in await self._crawl_exercises(entry.path, url):
entries_to_process.put_nowait(task)
entries_to_process.task_done()
continue continue
if entry.entry_type == IliasElementType.REGULAR_FOLDER: if entry.entry_type == IliasElementType.REGULAR_FOLDER:
entries_to_process += self._crawl_folder(entry.path, url) for task in await self._crawl_folder(entry.path, url):
entries_to_process.put_nowait(task)
entries_to_process.task_done()
continue continue
if entry.entry_type == IliasElementType.VIDEO_FOLDER: if entry.entry_type == IliasElementType.VIDEO_FOLDER:
entries_to_process += self._crawl_video_directory(entry.path, url) for task in await self._crawl_video_directory(entry.path, url):
entries_to_process.put_nowait(task)
entries_to_process.task_done()
continue continue
PRETTY.warning(f"Unknown type: {entry.entry_type}!") PRETTY.warning(f"Unknown type: {entry.entry_type}!")
async def _iterate_entries_to_download_infos(
self,
entries: List[IliasCrawlerEntry]
) -> List[IliasDownloadInfo]:
result: List[IliasDownloadInfo] = []
crawl_queue = asyncio.Queue()
for entry in entries:
crawl_queue.put_nowait(entry)
workers = []
# TODO: Find proper worker limit
for _ in range(10):
worker = asyncio.create_task(self._crawl_worker(crawl_queue, result))
workers.append(worker)
await crawl_queue.join()
for worker in workers:
worker.cancel()
# Wait until all worker tasks are cancelled.
await asyncio.gather(*workers, return_exceptions=True)
return result return result
def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]: async def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]:
""" """
Crawl all files in a folder-like element. Crawl all files in a folder-like element.
""" """
soup = self._get_page(url, {}) soup = await self._get_page(url, {})
if soup.find(id="headerimage"): if soup.find(id="headerimage"):
element: bs4.Tag = soup.find(id="headerimage") element: bs4.Tag = soup.find(id="headerimage")
@ -415,18 +449,18 @@ class IliasCrawler:
IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date) IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date)
] ]
def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]: async def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]:
""" """
Crawl the video overview site. Crawl the video overview site.
""" """
initial_soup = self._get_page(url, {}) initial_soup = await self._get_page(url, {})
# The page is actually empty but contains a much needed token in the link below. # The page is actually empty but contains a much needed token in the link below.
# That token can be used to fetch the *actual* video listing # That token can be used to fetch the *actual* video listing
content_link: bs4.Tag = initial_soup.select_one("#tab_series a") content_link: bs4.Tag = initial_soup.select_one("#tab_series a")
# Fetch the actual video listing. The given parameters return all videos (max 800) # Fetch the actual video listing. The given parameters return all videos (max 800)
# in a standalone html page # in a standalone html page
video_list_soup = self._get_page( video_list_soup = await self._get_page(
self._abs_url_from_link(content_link), self._abs_url_from_link(content_link),
{"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} {"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
) )
@ -445,7 +479,7 @@ class IliasCrawler:
def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool: def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool:
return soup.find(id=re.compile(r"tab_page_sel.+")) is not None return soup.find(id=re.compile(r"tab_page_sel.+")) is not None
def _crawl_paginated_video_directory( async def _crawl_paginated_video_directory(
self, self,
video_dir_path: Path, video_dir_path: Path,
paged_video_list_soup: bs4.BeautifulSoup, paged_video_list_soup: bs4.BeautifulSoup,
@ -475,7 +509,7 @@ class IliasCrawler:
return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup) return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
table_id = match.group(1) table_id = match.group(1)
extended_video_page = self._get_page( extended_video_page = await self._get_page(
second_stage_url, second_stage_url,
{f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"} {f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
) )
@ -557,14 +591,14 @@ class IliasCrawler:
modification_time modification_time
)] )]
def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Optional[str]]: def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Awaitable[Optional[str]]]:
def inner() -> Optional[str]: async def inner() -> Optional[str]:
# Fetch the actual video page. This is a small wrapper page initializing a javascript # Fetch the actual video page. This is a small wrapper page initializing a javascript
# player. Sadly we can not execute that JS. The actual video stream url is nowhere # player. Sadly we can not execute that JS. The actual video stream url is nowhere
# on the page, but defined in a JS object inside a script tag, passed to the player # on the page, but defined in a JS object inside a script tag, passed to the player
# library. # library.
# We do the impossible and RegEx the stream JSON object out of the page's HTML source # We do the impossible and RegEx the stream JSON object out of the page's HTML source
video_page_soup = soupify(self._client.get(play_url)) video_page_soup = soupify(await self._client.get(play_url))
regex: re.Pattern = re.compile( regex: re.Pattern = re.compile(
r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
) )
@ -582,11 +616,11 @@ class IliasCrawler:
return video_url return video_url
return inner return inner
def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]: async def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]:
""" """
Crawl files offered for download in exercises. Crawl files offered for download in exercises.
""" """
soup = self._get_page(url, {}) soup = await self._get_page(url, {})
results: List[IliasCrawlerEntry] = [] results: List[IliasCrawlerEntry] = []
@ -626,7 +660,7 @@ class IliasCrawler:
return results return results
@retry_on_io_exception(3, "fetching webpage") @retry_on_io_exception(3, "fetching webpage")
def _get_page(self, url: str, params: Dict[str, Any], async def _get_page(self, url: str, params: Dict[str, Any],
retry_count: int = 0) -> bs4.BeautifulSoup: retry_count: int = 0) -> bs4.BeautifulSoup:
""" """
Fetches a page from ILIAS, authenticating when needed. Fetches a page from ILIAS, authenticating when needed.
@ -639,7 +673,7 @@ class IliasCrawler:
LOGGER.debug("Fetching %r", url) LOGGER.debug("Fetching %r", url)
response = self._client.get(url, params=params) response = await self._client.get(url, params=params)
content_type = response.headers["content-type"] content_type = response.headers["content-type"]
if not content_type.startswith("text/html"): if not content_type.startswith("text/html"):
@ -657,7 +691,7 @@ class IliasCrawler:
self._authenticator.authenticate(self._client) self._authenticator.authenticate(self._client)
return self._get_page(url, params, retry_count + 1) return await self._get_page(url, params, retry_count + 1)
@staticmethod @staticmethod
def _is_logged_in(soup: bs4.BeautifulSoup) -> bool: def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:

View file

@ -5,10 +5,11 @@ import logging
import math import math
import os import os
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import Callable, List, Optional, Union from typing import Callable, Awaitable, List, Optional, Union
import bs4 import bs4
import httpx import httpx
import asyncio
from ..errors import retry_on_io_exception from ..errors import retry_on_io_exception
from ..logging import PrettyLogger from ..logging import PrettyLogger
@ -34,13 +35,14 @@ class IliasDownloadInfo(Transformable):
def __init__( def __init__(
self, self,
path: PurePath, path: PurePath,
url: Union[str, Callable[[], Optional[str]]], url: Union[str, Callable[[], Awaitable[Optional[str]]]],
modifcation_date: Optional[datetime.datetime] modifcation_date: Optional[datetime.datetime]
): ):
super().__init__(path) super().__init__(path)
if isinstance(url, str): if isinstance(url, str):
string_url = url future = asyncio.Future()
self.url: Callable[[], Optional[str]] = lambda: string_url future.set_result(url)
self.url: Callable[[], Optional[str]] = lambda: future
else: else:
self.url = url self.url = url
self.modification_date = modifcation_date self.modification_date = modifcation_date
@ -98,15 +100,15 @@ class IliasDownloader:
self._strategy = strategy self._strategy = strategy
self._timeout = timeout self._timeout = timeout
def download_all(self, infos: List[IliasDownloadInfo]) -> None: async def download_all(self, infos: List[IliasDownloadInfo]) -> None:
""" """
Download multiple files one after the other. Download multiple files one after the other.
""" """
for info in infos: tasks = [self.download(info) for info in infos]
self.download(info) await asyncio.gather(*tasks)
def download(self, info: IliasDownloadInfo) -> None: async def download(self, info: IliasDownloadInfo) -> None:
""" """
Download a file from ILIAS. Download a file from ILIAS.
@ -122,15 +124,15 @@ class IliasDownloader:
tmp_file = self._tmp_dir.new_path() tmp_file = self._tmp_dir.new_path()
@retry_on_io_exception(3, "downloading file") @retry_on_io_exception(3, "downloading file")
def download_impl() -> bool: async def download_impl() -> bool:
if not self._try_download(info, tmp_file): if not await self._try_download(info, tmp_file):
LOGGER.info("Re-Authenticating due to download failure: %r", info) LOGGER.info("Re-Authenticating due to download failure: %r", info)
self._authenticator.authenticate(self._client) self._authenticator.authenticate(self._client)
raise IOError("Scheduled retry") raise IOError("Scheduled retry")
else: else:
return True return True
if not download_impl(): if not await download_impl():
PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...") PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...")
return return
@ -144,8 +146,8 @@ class IliasDownloader:
) )
) )
def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool: async def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:
url = info.url() url = await info.url()
if url is None: if url is None:
PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/") PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
return True return True

View file

@ -4,7 +4,8 @@ Convenience functions for using PFERD.
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Callable, List, Optional, Union from typing import Callable, Awaitable, List, Optional, Union
import asyncio
from .authenticators import UserPassAuthenticator from .authenticators import UserPassAuthenticator
from .cookie_jar import CookieJar from .cookie_jar import CookieJar
@ -72,11 +73,11 @@ class Pferd(Location):
inner_auth = UserPassAuthenticator("ILIAS - Pferd.py", username, password) inner_auth = UserPassAuthenticator("ILIAS - Pferd.py", username, password)
return KitShibbolethAuthenticator(inner_auth) return KitShibbolethAuthenticator(inner_auth)
def _ilias( async def _ilias(
self, self,
target: PathLike, target: PathLike,
base_url: str, base_url: str,
crawl_function: Callable[[IliasCrawler], List[IliasDownloadInfo]], crawl_function: Callable[[IliasCrawler], Awaitable[List[IliasDownloadInfo]]],
authenticator: IliasAuthenticator, authenticator: IliasAuthenticator,
cookies: Optional[PathLike], cookies: Optional[PathLike],
dir_filter: IliasDirectoryFilter, dir_filter: IliasDirectoryFilter,
@ -89,28 +90,31 @@ class Pferd(Location):
# pylint: disable=too-many-locals # pylint: disable=too-many-locals
cookie_jar = CookieJar(to_path(cookies) if cookies else None) cookie_jar = CookieJar(to_path(cookies) if cookies else None)
client = cookie_jar.create_client() client = cookie_jar.create_client()
async_client = cookie_jar.create_async_client()
tmp_dir = self._tmp_dir.new_subdir() tmp_dir = self._tmp_dir.new_subdir()
organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver) organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver)
crawler = IliasCrawler(base_url, client, authenticator, dir_filter) crawler = IliasCrawler(base_url, async_client, authenticator, dir_filter)
downloader = IliasDownloader(tmp_dir, organizer, client, downloader = IliasDownloader(tmp_dir, organizer, client,
authenticator, download_strategy, timeout) authenticator, download_strategy, timeout)
cookie_jar.load_cookies() cookie_jar.load_cookies()
info = crawl_function(crawler) info = await crawl_function(crawler)
cookie_jar.save_cookies() cookie_jar.save_cookies()
transformed = apply_transform(transform, info) transformed = apply_transform(transform, info)
if self._test_run: if self._test_run:
self._print_transformables(transformed) self._print_transformables(transformed)
return organizer return organizer
downloader.download_all(transformed) await downloader.download_all(transformed)
cookie_jar.save_cookies() cookie_jar.save_cookies()
if clean: if clean:
organizer.cleanup() organizer.cleanup()
await async_client.aclose()
return organizer return organizer
@swallow_and_print_errors @swallow_and_print_errors
@ -161,7 +165,7 @@ class Pferd(Location):
authenticator = Pferd._get_authenticator(username=username, password=password) authenticator = Pferd._get_authenticator(username=username, password=password)
PRETTY.starting_synchronizer(target, "ILIAS", course_id) PRETTY.starting_synchronizer(target, "ILIAS", course_id)
organizer = self._ilias( organizer = asyncio.run(self._ilias(
target=target, target=target,
base_url="https://ilias.studium.kit.edu/", base_url="https://ilias.studium.kit.edu/",
crawl_function=lambda crawler: crawler.crawl_course(course_id), crawl_function=lambda crawler: crawler.crawl_course(course_id),
@ -173,7 +177,7 @@ class Pferd(Location):
clean=clean, clean=clean,
timeout=timeout, timeout=timeout,
file_conflict_resolver=file_conflict_resolver file_conflict_resolver=file_conflict_resolver
) ))
self._download_summary.merge(organizer.download_summary) self._download_summary.merge(organizer.download_summary)
@ -230,7 +234,7 @@ class Pferd(Location):
authenticator = Pferd._get_authenticator(username, password) authenticator = Pferd._get_authenticator(username, password)
PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop") PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop")
organizer = self._ilias( organizer = asyncio.run(self._ilias(
target=target, target=target,
base_url="https://ilias.studium.kit.edu/", base_url="https://ilias.studium.kit.edu/",
crawl_function=lambda crawler: crawler.crawl_personal_desktop(), crawl_function=lambda crawler: crawler.crawl_personal_desktop(),
@ -242,7 +246,7 @@ class Pferd(Location):
clean=clean, clean=clean,
timeout=timeout, timeout=timeout,
file_conflict_resolver=file_conflict_resolver file_conflict_resolver=file_conflict_resolver
) ))
self._download_summary.merge(organizer.download_summary) self._download_summary.merge(organizer.download_summary)
@ -298,7 +302,7 @@ class Pferd(Location):
if not full_url.startswith("https://ilias.studium.kit.edu"): if not full_url.startswith("https://ilias.studium.kit.edu"):
raise FatalException("Not a valid KIT ILIAS URL") raise FatalException("Not a valid KIT ILIAS URL")
organizer = self._ilias( organizer = asyncio.run(self._ilias(
target=target, target=target,
base_url="https://ilias.studium.kit.edu/", base_url="https://ilias.studium.kit.edu/",
crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url), crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url),
@ -310,7 +314,7 @@ class Pferd(Location):
clean=clean, clean=clean,
timeout=timeout, timeout=timeout,
file_conflict_resolver=file_conflict_resolver file_conflict_resolver=file_conflict_resolver
) ))
self._download_summary.merge(organizer.download_summary) self._download_summary.merge(organizer.download_summary)