mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-13 07:55:05 +02:00
use the async client to crawl ILIAS
This commit is contained in:
parent
44aeb6c2eb
commit
411d4b91d5
5 changed files with 131 additions and 84 deletions
|
|
@ -65,3 +65,10 @@ class CookieJar:
|
||||||
client.cookies = self.cookies # type: ignore
|
client.cookies = self.cookies # type: ignore
|
||||||
|
|
||||||
return client
|
return client
|
||||||
|
|
||||||
|
def create_async_client(self) -> httpx.AsyncClient:
|
||||||
|
"""Create a new async client using the cookie jar."""
|
||||||
|
# TODO: timeout=None was the default behaviour of requests. An appropriate value should probably be set.
|
||||||
|
client = httpx.AsyncClient(timeout=None)
|
||||||
|
client.cookies = self.cookies
|
||||||
|
return client
|
||||||
|
|
|
||||||
|
|
@ -24,9 +24,9 @@ class IliasAuthenticator(abc.ABC):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def authenticate(self, client: httpx.Client) -> None:
|
async def authenticate(self, client: httpx.AsyncClient) -> None:
|
||||||
"""
|
"""
|
||||||
Log a httpx client into this authenticator's ILIAS account.
|
Log a httpx AsyncClient into this authenticator's ILIAS account.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -45,7 +45,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
|
||||||
|
|
||||||
self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth")
|
self._tfa_auth = TfaAuthenticator("KIT ILIAS Shibboleth")
|
||||||
|
|
||||||
def authenticate(self, sess: httpx.Client) -> None:
|
async def authenticate(self, client: httpx.AsyncClient) -> None:
|
||||||
"""
|
"""
|
||||||
Performs the ILIAS Shibboleth authentication dance and saves the login
|
Performs the ILIAS Shibboleth authentication dance and saves the login
|
||||||
cookies it receives.
|
cookies it receives.
|
||||||
|
|
@ -65,7 +65,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
|
||||||
"target": "/shib_login.php",
|
"target": "/shib_login.php",
|
||||||
"home_organization_selection": "Mit KIT-Account anmelden",
|
"home_organization_selection": "Mit KIT-Account anmelden",
|
||||||
}
|
}
|
||||||
soup = soupify(sess.post(url, data=data))
|
soup = soupify(await client.post(url, data=data))
|
||||||
|
|
||||||
# Attempt to login using credentials, if necessary
|
# Attempt to login using credentials, if necessary
|
||||||
while not self._login_successful(soup):
|
while not self._login_successful(soup):
|
||||||
|
|
@ -86,10 +86,10 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
|
||||||
"j_password": self._auth.password,
|
"j_password": self._auth.password,
|
||||||
"csrf_token": csrf_token
|
"csrf_token": csrf_token
|
||||||
}
|
}
|
||||||
soup = soupify(sess.post(url, data=data))
|
soup = soupify(await client.post(url, data=data))
|
||||||
|
|
||||||
if self._tfa_required(soup):
|
if self._tfa_required(soup):
|
||||||
soup = self._authenticate_tfa(sess, soup)
|
soup = await self._authenticate_tfa(client, soup)
|
||||||
|
|
||||||
if not self._login_successful(soup):
|
if not self._login_successful(soup):
|
||||||
print("Incorrect credentials.")
|
print("Incorrect credentials.")
|
||||||
|
|
@ -105,11 +105,11 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
|
||||||
"RelayState": relay_state["value"],
|
"RelayState": relay_state["value"],
|
||||||
"SAMLResponse": saml_response["value"],
|
"SAMLResponse": saml_response["value"],
|
||||||
}
|
}
|
||||||
sess.post(url, data=data)
|
await client.post(url, data=data)
|
||||||
|
|
||||||
def _authenticate_tfa(
|
async def _authenticate_tfa(
|
||||||
self,
|
self,
|
||||||
client: httpx.Client,
|
client: httpx.AsyncClient,
|
||||||
soup: bs4.BeautifulSoup
|
soup: bs4.BeautifulSoup
|
||||||
) -> bs4.BeautifulSoup:
|
) -> bs4.BeautifulSoup:
|
||||||
# Searching the form here so that this fails before asking for
|
# Searching the form here so that this fails before asking for
|
||||||
|
|
@ -125,7 +125,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
|
||||||
"_eventId_proceed": "",
|
"_eventId_proceed": "",
|
||||||
"j_tokenNumber": self._tfa_auth.get_token()
|
"j_tokenNumber": self._tfa_auth.get_token()
|
||||||
}
|
}
|
||||||
return soupify(client.post(url, data=data))
|
return soupify(await client.post(url, data=data))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _login_successful(soup: bs4.BeautifulSoup) -> bool:
|
def _login_successful(soup: bs4.BeautifulSoup) -> bool:
|
||||||
|
|
|
||||||
|
|
@ -2,18 +2,20 @@
|
||||||
Contains an ILIAS crawler alongside helper functions.
|
Contains an ILIAS crawler alongside helper functions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from asyncio.queues import Queue
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, Dict, List, Optional, Union
|
from typing import Any, Callable, Awaitable, Dict, List, Optional, Union
|
||||||
from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
|
from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
|
||||||
urlunsplit)
|
urlunsplit)
|
||||||
|
|
||||||
import bs4
|
import bs4
|
||||||
import httpx
|
import httpx
|
||||||
|
import asyncio
|
||||||
|
|
||||||
from ..errors import FatalException, retry_on_io_exception
|
from ..errors import FatalException, retry_on_io_exception
|
||||||
from ..logging import PrettyLogger
|
from ..logging import PrettyLogger
|
||||||
|
|
@ -62,14 +64,16 @@ class IliasCrawlerEntry:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
path: Path,
|
path: Path,
|
||||||
url: Union[str, Callable[[], Optional[str]]],
|
url: Union[str, Callable[[], Awaitable[Optional[str]]]],
|
||||||
entry_type: IliasElementType,
|
entry_type: IliasElementType,
|
||||||
modification_date: Optional[datetime.datetime]
|
modification_date: Optional[datetime.datetime]
|
||||||
):
|
):
|
||||||
self.path = path
|
self.path = path
|
||||||
if isinstance(url, str):
|
if isinstance(url, str):
|
||||||
str_url = url
|
# TODO: Dirty hack, remove
|
||||||
self.url: Callable[[], Optional[str]] = lambda: str_url
|
future = asyncio.Future()
|
||||||
|
future.set_result(url)
|
||||||
|
self.url: Callable[[], Awaitable[Optional[str]]] = lambda: future
|
||||||
else:
|
else:
|
||||||
self.url = url
|
self.url = url
|
||||||
self.entry_type = entry_type
|
self.entry_type = entry_type
|
||||||
|
|
@ -96,7 +100,7 @@ class IliasCrawler:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
client: httpx.Client,
|
client: httpx.AsyncClient,
|
||||||
authenticator: IliasAuthenticator,
|
authenticator: IliasAuthenticator,
|
||||||
dir_filter: IliasDirectoryFilter
|
dir_filter: IliasDirectoryFilter
|
||||||
):
|
):
|
||||||
|
|
@ -121,17 +125,17 @@ class IliasCrawler:
|
||||||
|
|
||||||
return urlunsplit((scheme, netloc, path, new_query_string, fragment))
|
return urlunsplit((scheme, netloc, path, new_query_string, fragment))
|
||||||
|
|
||||||
def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]:
|
async def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]:
|
||||||
"""
|
"""
|
||||||
Crawls a given url *and all reachable elements in it*.
|
Crawls a given url *and all reachable elements in it*.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
url {str} -- the *full* url to crawl
|
url {str} -- the *full* url to crawl
|
||||||
"""
|
"""
|
||||||
start_entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), url)
|
start_entries: List[IliasCrawlerEntry] = await self._crawl_folder(Path(""), url)
|
||||||
return self._iterate_entries_to_download_infos(start_entries)
|
return await self._iterate_entries_to_download_infos(start_entries)
|
||||||
|
|
||||||
def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]:
|
async def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]:
|
||||||
"""
|
"""
|
||||||
Starts the crawl process for a course, yielding a list of elements to (potentially)
|
Starts the crawl process for a course, yielding a list of elements to (potentially)
|
||||||
download.
|
download.
|
||||||
|
|
@ -147,28 +151,28 @@ class IliasCrawler:
|
||||||
self._base_url + "/goto.php", "target", f"crs_{course_id}"
|
self._base_url + "/goto.php", "target", f"crs_{course_id}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if not self._is_course_id_valid(root_url, course_id):
|
if not await self._is_course_id_valid(root_url, course_id):
|
||||||
raise FatalException(
|
raise FatalException(
|
||||||
"Invalid course id? I didn't find anything looking like a course!"
|
"Invalid course id? I didn't find anything looking like a course!"
|
||||||
)
|
)
|
||||||
|
|
||||||
# And treat it as a folder
|
# And treat it as a folder
|
||||||
entries: List[IliasCrawlerEntry] = self._crawl_folder(Path(""), root_url)
|
entries: List[IliasCrawlerEntry] = await self._crawl_folder(Path(""), root_url)
|
||||||
return self._iterate_entries_to_download_infos(entries)
|
return await self._iterate_entries_to_download_infos(entries)
|
||||||
|
|
||||||
def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
|
async def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
|
||||||
response: httpx.Response = self._client.get(root_url)
|
response: httpx.Response = await self._client.get(root_url)
|
||||||
# We were redirected ==> Non-existent ID
|
# We were redirected ==> Non-existent ID
|
||||||
if course_id not in str(response.url):
|
if course_id not in str(response.url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
link_element: bs4.Tag = self._get_page(root_url, {}).find(id="current_perma_link")
|
link_element: bs4.Tag = (await self._get_page(root_url, {})).find(id="current_perma_link")
|
||||||
if not link_element:
|
if not link_element:
|
||||||
return False
|
return False
|
||||||
# It wasn't a course but a category list, forum, etc.
|
# It wasn't a course but a category list, forum, etc.
|
||||||
return "crs_" in link_element.get("value")
|
return "crs_" in link_element.get("value")
|
||||||
|
|
||||||
def find_course_name(self, course_id: str) -> Optional[str]:
|
async def find_course_name(self, course_id: str) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Returns the name of a given course. None if it is not a valid course
|
Returns the name of a given course. None if it is not a valid course
|
||||||
or it could not be found.
|
or it could not be found.
|
||||||
|
|
@ -176,81 +180,111 @@ class IliasCrawler:
|
||||||
course_url = self._url_set_query_param(
|
course_url = self._url_set_query_param(
|
||||||
self._base_url + "/goto.php", "target", f"crs_{course_id}"
|
self._base_url + "/goto.php", "target", f"crs_{course_id}"
|
||||||
)
|
)
|
||||||
return self.find_element_name(course_url)
|
return await self.find_element_name(course_url)
|
||||||
|
|
||||||
def find_element_name(self, url: str) -> Optional[str]:
|
async def find_element_name(self, url: str) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Returns the name of the element at the given URL, if it can find one.
|
Returns the name of the element at the given URL, if it can find one.
|
||||||
"""
|
"""
|
||||||
focus_element: bs4.Tag = self._get_page(url, {}).find(id="il_mhead_t_focus")
|
focus_element: bs4.Tag = await self._get_page(url, {}).find(id="il_mhead_t_focus")
|
||||||
if not focus_element:
|
if not focus_element:
|
||||||
return None
|
return None
|
||||||
return focus_element.text
|
return focus_element.text
|
||||||
|
|
||||||
def crawl_personal_desktop(self) -> List[IliasDownloadInfo]:
|
async def crawl_personal_desktop(self) -> List[IliasDownloadInfo]:
|
||||||
"""
|
"""
|
||||||
Crawls the ILIAS personal desktop (and every subelements that can be reached from there).
|
Crawls the ILIAS personal desktop (and every subelements that can be reached from there).
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
FatalException: if an unrecoverable error occurs
|
FatalException: if an unrecoverable error occurs
|
||||||
"""
|
"""
|
||||||
entries: List[IliasCrawlerEntry] = self._crawl_folder(
|
entries: List[IliasCrawlerEntry] = await self._crawl_folder(
|
||||||
Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI"
|
Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI"
|
||||||
)
|
)
|
||||||
return self._iterate_entries_to_download_infos(entries)
|
return await self._iterate_entries_to_download_infos(entries)
|
||||||
|
|
||||||
def _iterate_entries_to_download_infos(
|
async def _crawl_worker(self, entries_to_process: asyncio.Queue, result: List[IliasDownloadInfo]):
|
||||||
self,
|
while True:
|
||||||
entries: List[IliasCrawlerEntry]
|
entry = await entries_to_process.get()
|
||||||
) -> List[IliasDownloadInfo]:
|
|
||||||
result: List[IliasDownloadInfo] = []
|
|
||||||
entries_to_process: List[IliasCrawlerEntry] = entries.copy()
|
|
||||||
while len(entries_to_process) > 0:
|
|
||||||
entry = entries_to_process.pop()
|
|
||||||
|
|
||||||
if entry.entry_type == IliasElementType.EXTERNAL_LINK:
|
if entry.entry_type == IliasElementType.EXTERNAL_LINK:
|
||||||
PRETTY.not_searching(entry.path, "external link")
|
PRETTY.not_searching(entry.path, "external link")
|
||||||
|
entries_to_process.task_done()
|
||||||
continue
|
continue
|
||||||
if entry.entry_type == IliasElementType.FORUM:
|
if entry.entry_type == IliasElementType.FORUM:
|
||||||
PRETTY.not_searching(entry.path, "forum")
|
PRETTY.not_searching(entry.path, "forum")
|
||||||
|
entries_to_process.task_done()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type):
|
if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type):
|
||||||
PRETTY.not_searching(entry.path, "user filter")
|
PRETTY.not_searching(entry.path, "user filter")
|
||||||
|
entries_to_process.task_done()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
download_info = entry.to_download_info()
|
download_info = entry.to_download_info()
|
||||||
if download_info is not None:
|
if download_info is not None:
|
||||||
result.append(download_info)
|
result.append(download_info)
|
||||||
|
entries_to_process.task_done()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
url = entry.url()
|
url = await entry.url()
|
||||||
|
|
||||||
if url is None:
|
if url is None:
|
||||||
PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it")
|
PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it")
|
||||||
|
entries_to_process.task_done()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
PRETTY.searching(entry.path)
|
PRETTY.searching(entry.path)
|
||||||
|
|
||||||
if entry.entry_type == IliasElementType.EXERCISE_FOLDER:
|
if entry.entry_type == IliasElementType.EXERCISE_FOLDER:
|
||||||
entries_to_process += self._crawl_exercises(entry.path, url)
|
for task in await self._crawl_exercises(entry.path, url):
|
||||||
|
entries_to_process.put_nowait(task)
|
||||||
|
entries_to_process.task_done()
|
||||||
continue
|
continue
|
||||||
if entry.entry_type == IliasElementType.REGULAR_FOLDER:
|
if entry.entry_type == IliasElementType.REGULAR_FOLDER:
|
||||||
entries_to_process += self._crawl_folder(entry.path, url)
|
for task in await self._crawl_folder(entry.path, url):
|
||||||
|
entries_to_process.put_nowait(task)
|
||||||
|
entries_to_process.task_done()
|
||||||
continue
|
continue
|
||||||
if entry.entry_type == IliasElementType.VIDEO_FOLDER:
|
if entry.entry_type == IliasElementType.VIDEO_FOLDER:
|
||||||
entries_to_process += self._crawl_video_directory(entry.path, url)
|
for task in await self._crawl_video_directory(entry.path, url):
|
||||||
|
entries_to_process.put_nowait(task)
|
||||||
|
entries_to_process.task_done()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
PRETTY.warning(f"Unknown type: {entry.entry_type}!")
|
PRETTY.warning(f"Unknown type: {entry.entry_type}!")
|
||||||
|
|
||||||
|
|
||||||
|
async def _iterate_entries_to_download_infos(
|
||||||
|
self,
|
||||||
|
entries: List[IliasCrawlerEntry]
|
||||||
|
) -> List[IliasDownloadInfo]:
|
||||||
|
result: List[IliasDownloadInfo] = []
|
||||||
|
crawl_queue = asyncio.Queue()
|
||||||
|
for entry in entries:
|
||||||
|
crawl_queue.put_nowait(entry)
|
||||||
|
|
||||||
|
workers = []
|
||||||
|
|
||||||
|
# TODO: Find proper worker limit
|
||||||
|
for _ in range(10):
|
||||||
|
worker = asyncio.create_task(self._crawl_worker(crawl_queue, result))
|
||||||
|
workers.append(worker)
|
||||||
|
|
||||||
|
await crawl_queue.join()
|
||||||
|
|
||||||
|
for worker in workers:
|
||||||
|
worker.cancel()
|
||||||
|
|
||||||
|
# Wait until all worker tasks are cancelled.
|
||||||
|
await asyncio.gather(*workers, return_exceptions=True)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]:
|
async def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]:
|
||||||
"""
|
"""
|
||||||
Crawl all files in a folder-like element.
|
Crawl all files in a folder-like element.
|
||||||
"""
|
"""
|
||||||
soup = self._get_page(url, {})
|
soup = await self._get_page(url, {})
|
||||||
|
|
||||||
if soup.find(id="headerimage"):
|
if soup.find(id="headerimage"):
|
||||||
element: bs4.Tag = soup.find(id="headerimage")
|
element: bs4.Tag = soup.find(id="headerimage")
|
||||||
|
|
@ -415,18 +449,18 @@ class IliasCrawler:
|
||||||
IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date)
|
IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date)
|
||||||
]
|
]
|
||||||
|
|
||||||
def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]:
|
async def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]:
|
||||||
"""
|
"""
|
||||||
Crawl the video overview site.
|
Crawl the video overview site.
|
||||||
"""
|
"""
|
||||||
initial_soup = self._get_page(url, {})
|
initial_soup = await self._get_page(url, {})
|
||||||
|
|
||||||
# The page is actually empty but contains a much needed token in the link below.
|
# The page is actually empty but contains a much needed token in the link below.
|
||||||
# That token can be used to fetch the *actual* video listing
|
# That token can be used to fetch the *actual* video listing
|
||||||
content_link: bs4.Tag = initial_soup.select_one("#tab_series a")
|
content_link: bs4.Tag = initial_soup.select_one("#tab_series a")
|
||||||
# Fetch the actual video listing. The given parameters return all videos (max 800)
|
# Fetch the actual video listing. The given parameters return all videos (max 800)
|
||||||
# in a standalone html page
|
# in a standalone html page
|
||||||
video_list_soup = self._get_page(
|
video_list_soup = await self._get_page(
|
||||||
self._abs_url_from_link(content_link),
|
self._abs_url_from_link(content_link),
|
||||||
{"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
|
{"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
|
||||||
)
|
)
|
||||||
|
|
@ -445,7 +479,7 @@ class IliasCrawler:
|
||||||
def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool:
|
def _is_paginated_video_page(soup: bs4.BeautifulSoup) -> bool:
|
||||||
return soup.find(id=re.compile(r"tab_page_sel.+")) is not None
|
return soup.find(id=re.compile(r"tab_page_sel.+")) is not None
|
||||||
|
|
||||||
def _crawl_paginated_video_directory(
|
async def _crawl_paginated_video_directory(
|
||||||
self,
|
self,
|
||||||
video_dir_path: Path,
|
video_dir_path: Path,
|
||||||
paged_video_list_soup: bs4.BeautifulSoup,
|
paged_video_list_soup: bs4.BeautifulSoup,
|
||||||
|
|
@ -475,7 +509,7 @@ class IliasCrawler:
|
||||||
return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
|
return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
|
||||||
table_id = match.group(1)
|
table_id = match.group(1)
|
||||||
|
|
||||||
extended_video_page = self._get_page(
|
extended_video_page = await self._get_page(
|
||||||
second_stage_url,
|
second_stage_url,
|
||||||
{f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
|
{f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
|
||||||
)
|
)
|
||||||
|
|
@ -557,14 +591,14 @@ class IliasCrawler:
|
||||||
modification_time
|
modification_time
|
||||||
)]
|
)]
|
||||||
|
|
||||||
def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Optional[str]]:
|
def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Awaitable[Optional[str]]]:
|
||||||
def inner() -> Optional[str]:
|
async def inner() -> Optional[str]:
|
||||||
# Fetch the actual video page. This is a small wrapper page initializing a JavaScript
|
# Fetch the actual video page. This is a small wrapper page initializing a JavaScript
|
||||||
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
|
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
|
||||||
# on the page, but defined in a JS object inside a script tag, passed to the player
|
# on the page, but defined in a JS object inside a script tag, passed to the player
|
||||||
# library.
|
# library.
|
||||||
# We do the impossible and RegEx the stream JSON object out of the page's HTML source
|
# We do the impossible and RegEx the stream JSON object out of the page's HTML source
|
||||||
video_page_soup = soupify(self._client.get(play_url))
|
video_page_soup = soupify(await self._client.get(play_url))
|
||||||
regex: re.Pattern = re.compile(
|
regex: re.Pattern = re.compile(
|
||||||
r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
|
r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE
|
||||||
)
|
)
|
||||||
|
|
@ -582,11 +616,11 @@ class IliasCrawler:
|
||||||
return video_url
|
return video_url
|
||||||
return inner
|
return inner
|
||||||
|
|
||||||
def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]:
|
async def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]:
|
||||||
"""
|
"""
|
||||||
Crawl files offered for download in exercises.
|
Crawl files offered for download in exercises.
|
||||||
"""
|
"""
|
||||||
soup = self._get_page(url, {})
|
soup = await self._get_page(url, {})
|
||||||
|
|
||||||
results: List[IliasCrawlerEntry] = []
|
results: List[IliasCrawlerEntry] = []
|
||||||
|
|
||||||
|
|
@ -626,7 +660,7 @@ class IliasCrawler:
|
||||||
return results
|
return results
|
||||||
|
|
||||||
@retry_on_io_exception(3, "fetching webpage")
|
@retry_on_io_exception(3, "fetching webpage")
|
||||||
def _get_page(self, url: str, params: Dict[str, Any],
|
async def _get_page(self, url: str, params: Dict[str, Any],
|
||||||
retry_count: int = 0) -> bs4.BeautifulSoup:
|
retry_count: int = 0) -> bs4.BeautifulSoup:
|
||||||
"""
|
"""
|
||||||
Fetches a page from ILIAS, authenticating when needed.
|
Fetches a page from ILIAS, authenticating when needed.
|
||||||
|
|
@ -639,7 +673,7 @@ class IliasCrawler:
|
||||||
|
|
||||||
LOGGER.debug("Fetching %r", url)
|
LOGGER.debug("Fetching %r", url)
|
||||||
|
|
||||||
response = self._client.get(url, params=params)
|
response = await self._client.get(url, params=params)
|
||||||
content_type = response.headers["content-type"]
|
content_type = response.headers["content-type"]
|
||||||
|
|
||||||
if not content_type.startswith("text/html"):
|
if not content_type.startswith("text/html"):
|
||||||
|
|
@ -657,7 +691,7 @@ class IliasCrawler:
|
||||||
|
|
||||||
self._authenticator.authenticate(self._client)
|
self._authenticator.authenticate(self._client)
|
||||||
|
|
||||||
return self._get_page(url, params, retry_count + 1)
|
return await self._get_page(url, params, retry_count + 1)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:
|
def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:
|
||||||
|
|
|
||||||
|
|
@ -5,10 +5,11 @@ import logging
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import Callable, List, Optional, Union
|
from typing import Callable, Awaitable, List, Optional, Union
|
||||||
|
|
||||||
import bs4
|
import bs4
|
||||||
import httpx
|
import httpx
|
||||||
|
import asyncio
|
||||||
|
|
||||||
from ..errors import retry_on_io_exception
|
from ..errors import retry_on_io_exception
|
||||||
from ..logging import PrettyLogger
|
from ..logging import PrettyLogger
|
||||||
|
|
@ -34,13 +35,14 @@ class IliasDownloadInfo(Transformable):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
path: PurePath,
|
path: PurePath,
|
||||||
url: Union[str, Callable[[], Optional[str]]],
|
url: Union[str, Callable[[], Awaitable[Optional[str]]]],
|
||||||
modifcation_date: Optional[datetime.datetime]
|
modifcation_date: Optional[datetime.datetime]
|
||||||
):
|
):
|
||||||
super().__init__(path)
|
super().__init__(path)
|
||||||
if isinstance(url, str):
|
if isinstance(url, str):
|
||||||
string_url = url
|
future = asyncio.Future()
|
||||||
self.url: Callable[[], Optional[str]] = lambda: string_url
|
future.set_result(url)
|
||||||
|
self.url: Callable[[], Optional[str]] = lambda: future
|
||||||
else:
|
else:
|
||||||
self.url = url
|
self.url = url
|
||||||
self.modification_date = modifcation_date
|
self.modification_date = modifcation_date
|
||||||
|
|
@ -98,15 +100,15 @@ class IliasDownloader:
|
||||||
self._strategy = strategy
|
self._strategy = strategy
|
||||||
self._timeout = timeout
|
self._timeout = timeout
|
||||||
|
|
||||||
def download_all(self, infos: List[IliasDownloadInfo]) -> None:
|
async def download_all(self, infos: List[IliasDownloadInfo]) -> None:
|
||||||
"""
|
"""
|
||||||
Download multiple files one after the other.
|
Download multiple files concurrently.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
for info in infos:
|
tasks = [self.download(info) for info in infos]
|
||||||
self.download(info)
|
await asyncio.gather(*tasks)
|
||||||
|
|
||||||
def download(self, info: IliasDownloadInfo) -> None:
|
async def download(self, info: IliasDownloadInfo) -> None:
|
||||||
"""
|
"""
|
||||||
Download a file from ILIAS.
|
Download a file from ILIAS.
|
||||||
|
|
||||||
|
|
@ -122,15 +124,15 @@ class IliasDownloader:
|
||||||
tmp_file = self._tmp_dir.new_path()
|
tmp_file = self._tmp_dir.new_path()
|
||||||
|
|
||||||
@retry_on_io_exception(3, "downloading file")
|
@retry_on_io_exception(3, "downloading file")
|
||||||
def download_impl() -> bool:
|
async def download_impl() -> bool:
|
||||||
if not self._try_download(info, tmp_file):
|
if not await self._try_download(info, tmp_file):
|
||||||
LOGGER.info("Re-Authenticating due to download failure: %r", info)
|
LOGGER.info("Re-Authenticating due to download failure: %r", info)
|
||||||
self._authenticator.authenticate(self._client)
|
self._authenticator.authenticate(self._client)
|
||||||
raise IOError("Scheduled retry")
|
raise IOError("Scheduled retry")
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if not download_impl():
|
if not await download_impl():
|
||||||
PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...")
|
PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
@ -144,8 +146,8 @@ class IliasDownloader:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:
|
async def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:
|
||||||
url = info.url()
|
url = await info.url()
|
||||||
if url is None:
|
if url is None:
|
||||||
PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
|
PRETTY.warning(f"Could not download {str(info.path)!r} as I got no URL :/")
|
||||||
return True
|
return True
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,8 @@ Convenience functions for using PFERD.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Callable, List, Optional, Union
|
from typing import Callable, Awaitable, List, Optional, Union
|
||||||
|
import asyncio
|
||||||
|
|
||||||
from .authenticators import UserPassAuthenticator
|
from .authenticators import UserPassAuthenticator
|
||||||
from .cookie_jar import CookieJar
|
from .cookie_jar import CookieJar
|
||||||
|
|
@ -72,11 +73,11 @@ class Pferd(Location):
|
||||||
inner_auth = UserPassAuthenticator("ILIAS - Pferd.py", username, password)
|
inner_auth = UserPassAuthenticator("ILIAS - Pferd.py", username, password)
|
||||||
return KitShibbolethAuthenticator(inner_auth)
|
return KitShibbolethAuthenticator(inner_auth)
|
||||||
|
|
||||||
def _ilias(
|
async def _ilias(
|
||||||
self,
|
self,
|
||||||
target: PathLike,
|
target: PathLike,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
crawl_function: Callable[[IliasCrawler], List[IliasDownloadInfo]],
|
crawl_function: Callable[[IliasCrawler], Awaitable[List[IliasDownloadInfo]]],
|
||||||
authenticator: IliasAuthenticator,
|
authenticator: IliasAuthenticator,
|
||||||
cookies: Optional[PathLike],
|
cookies: Optional[PathLike],
|
||||||
dir_filter: IliasDirectoryFilter,
|
dir_filter: IliasDirectoryFilter,
|
||||||
|
|
@ -89,28 +90,31 @@ class Pferd(Location):
|
||||||
# pylint: disable=too-many-locals
|
# pylint: disable=too-many-locals
|
||||||
cookie_jar = CookieJar(to_path(cookies) if cookies else None)
|
cookie_jar = CookieJar(to_path(cookies) if cookies else None)
|
||||||
client = cookie_jar.create_client()
|
client = cookie_jar.create_client()
|
||||||
|
async_client = cookie_jar.create_async_client()
|
||||||
tmp_dir = self._tmp_dir.new_subdir()
|
tmp_dir = self._tmp_dir.new_subdir()
|
||||||
organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver)
|
organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver)
|
||||||
|
|
||||||
crawler = IliasCrawler(base_url, client, authenticator, dir_filter)
|
crawler = IliasCrawler(base_url, async_client, authenticator, dir_filter)
|
||||||
downloader = IliasDownloader(tmp_dir, organizer, client,
|
downloader = IliasDownloader(tmp_dir, organizer, client,
|
||||||
authenticator, download_strategy, timeout)
|
authenticator, download_strategy, timeout)
|
||||||
|
|
||||||
cookie_jar.load_cookies()
|
cookie_jar.load_cookies()
|
||||||
info = crawl_function(crawler)
|
info = await crawl_function(crawler)
|
||||||
cookie_jar.save_cookies()
|
cookie_jar.save_cookies()
|
||||||
|
|
||||||
|
|
||||||
transformed = apply_transform(transform, info)
|
transformed = apply_transform(transform, info)
|
||||||
if self._test_run:
|
if self._test_run:
|
||||||
self._print_transformables(transformed)
|
self._print_transformables(transformed)
|
||||||
return organizer
|
return organizer
|
||||||
|
|
||||||
downloader.download_all(transformed)
|
await downloader.download_all(transformed)
|
||||||
cookie_jar.save_cookies()
|
cookie_jar.save_cookies()
|
||||||
|
|
||||||
if clean:
|
if clean:
|
||||||
organizer.cleanup()
|
organizer.cleanup()
|
||||||
|
|
||||||
|
await async_client.aclose()
|
||||||
return organizer
|
return organizer
|
||||||
|
|
||||||
@swallow_and_print_errors
|
@swallow_and_print_errors
|
||||||
|
|
@ -161,7 +165,7 @@ class Pferd(Location):
|
||||||
authenticator = Pferd._get_authenticator(username=username, password=password)
|
authenticator = Pferd._get_authenticator(username=username, password=password)
|
||||||
PRETTY.starting_synchronizer(target, "ILIAS", course_id)
|
PRETTY.starting_synchronizer(target, "ILIAS", course_id)
|
||||||
|
|
||||||
organizer = self._ilias(
|
organizer = asyncio.run(self._ilias(
|
||||||
target=target,
|
target=target,
|
||||||
base_url="https://ilias.studium.kit.edu/",
|
base_url="https://ilias.studium.kit.edu/",
|
||||||
crawl_function=lambda crawler: crawler.crawl_course(course_id),
|
crawl_function=lambda crawler: crawler.crawl_course(course_id),
|
||||||
|
|
@ -173,7 +177,7 @@ class Pferd(Location):
|
||||||
clean=clean,
|
clean=clean,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
file_conflict_resolver=file_conflict_resolver
|
file_conflict_resolver=file_conflict_resolver
|
||||||
)
|
))
|
||||||
|
|
||||||
self._download_summary.merge(organizer.download_summary)
|
self._download_summary.merge(organizer.download_summary)
|
||||||
|
|
||||||
|
|
@ -230,7 +234,7 @@ class Pferd(Location):
|
||||||
authenticator = Pferd._get_authenticator(username, password)
|
authenticator = Pferd._get_authenticator(username, password)
|
||||||
PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop")
|
PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop")
|
||||||
|
|
||||||
organizer = self._ilias(
|
organizer = asyncio.run(self._ilias(
|
||||||
target=target,
|
target=target,
|
||||||
base_url="https://ilias.studium.kit.edu/",
|
base_url="https://ilias.studium.kit.edu/",
|
||||||
crawl_function=lambda crawler: crawler.crawl_personal_desktop(),
|
crawl_function=lambda crawler: crawler.crawl_personal_desktop(),
|
||||||
|
|
@ -242,7 +246,7 @@ class Pferd(Location):
|
||||||
clean=clean,
|
clean=clean,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
file_conflict_resolver=file_conflict_resolver
|
file_conflict_resolver=file_conflict_resolver
|
||||||
)
|
))
|
||||||
|
|
||||||
self._download_summary.merge(organizer.download_summary)
|
self._download_summary.merge(organizer.download_summary)
|
||||||
|
|
||||||
|
|
@ -298,7 +302,7 @@ class Pferd(Location):
|
||||||
if not full_url.startswith("https://ilias.studium.kit.edu"):
|
if not full_url.startswith("https://ilias.studium.kit.edu"):
|
||||||
raise FatalException("Not a valid KIT ILIAS URL")
|
raise FatalException("Not a valid KIT ILIAS URL")
|
||||||
|
|
||||||
organizer = self._ilias(
|
organizer = asyncio.run(self._ilias(
|
||||||
target=target,
|
target=target,
|
||||||
base_url="https://ilias.studium.kit.edu/",
|
base_url="https://ilias.studium.kit.edu/",
|
||||||
crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url),
|
crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url),
|
||||||
|
|
@ -310,7 +314,7 @@ class Pferd(Location):
|
||||||
clean=clean,
|
clean=clean,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
file_conflict_resolver=file_conflict_resolver
|
file_conflict_resolver=file_conflict_resolver
|
||||||
)
|
))
|
||||||
|
|
||||||
self._download_summary.merge(organizer.download_summary)
|
self._download_summary.merge(organizer.download_summary)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue