Use shared ILIASCrawler
parent 54a446c43c
commit e3a13143bc
21 changed files with 736 additions and 510 deletions
@@ -40,7 +40,7 @@ class CookieJar:
         except (FileNotFoundError, LoadError):
             LOGGER.warning(
                 "No valid cookie file found at %s, continuing with no cookies",
-                self._cookies.filename
+                self._cookies.filename,
             )

     def save_cookies(self, reason: Optional[str] = None) -> None:
@@ -69,6 +69,5 @@ class CookieJar:
     def create_async_client(self) -> httpx.AsyncClient:
         """Create a new async client using the cookie jar."""
         # TODO: timeout=None was the default behaviour of requests. An approprite value should probably be set
-        client = httpx.AsyncClient(timeout=None)
-        client.cookies = self.cookies
+        client = httpx.AsyncClient(timeout=None, cookies=self.cookies)
         return client
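The two-line version and the one-liner are equivalent because httpx accepts the cookie store as a constructor argument. A minimal runnable sketch of that behaviour, using plain httpx and an example domain (no PFERD code involved):

import asyncio

import httpx


async def main() -> None:
    # Passing cookies at construction time, as the new create_async_client
    # does, ends up in the same httpx.Cookies store as assigning
    # client.cookies after the fact.
    cookies = httpx.Cookies()
    cookies.set("session", "deadbeef", domain="example.org")
    async with httpx.AsyncClient(timeout=None, cookies=cookies) as client:
        response = await client.get("https://example.org/")
        print(response.status_code)


asyncio.run(main())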
@@ -25,6 +25,7 @@ class DivaDownloadInfo(Transformable):
     """
     Information about a DIVA video
     """
+
     url: str

@@ -49,7 +50,9 @@ class DivaPlaylistCrawler:
     """

     _PLAYLIST_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/detail/"
-    _COLLECTION_BASE_URL = "https://mediaservice.bibliothek.kit.edu/asset/collection.json"
+    _COLLECTION_BASE_URL = (
+        "https://mediaservice.bibliothek.kit.edu/asset/collection.json"
+    )

     def __init__(self, playlist_id: str):
         self._id = playlist_id

@@ -108,15 +111,16 @@ class DivaPlaylistCrawler:
             title = video["title"]
             collection_title = self._follow_path(["collection", "title"], video)
             url = self._follow_path(
-                ["resourceList", "derivateList", "mp4", "url"],
-                video
+                ["resourceList", "derivateList", "mp4", "url"], video
             )

             if url and collection_title and title:
                 path = Path(collection_title, title + ".mp4")
                 download_infos.append(DivaDownloadInfo(path, url))
             else:
-                PRETTY.warning(f"Incomplete video found: {title!r} {collection_title!r} {url!r}")
+                PRETTY.warning(
+                    f"Incomplete video found: {title!r} {collection_title!r} {url!r}"
+                )

         return download_infos

@@ -139,7 +143,9 @@ class DivaDownloader:
     A downloader for DIVA videos.
     """

-    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy):
+    def __init__(
+        self, tmp_dir: TmpDir, organizer: Organizer, strategy: DivaDownloadStrategy
+    ):
         self._tmp_dir = tmp_dir
         self._organizer = organizer
         self._strategy = strategy

@@ -166,4 +172,6 @@ class DivaDownloader:
             stream_to_path(response, tmp_file, info.path.name)
             self._organizer.accept_file(tmp_file, info.path)
         else:
-            PRETTY.warning(f"Could not download file, got response {response.status_code}")
+            PRETTY.warning(
+                f"Could not download file, got response {response.status_code}"
+            )
@@ -42,13 +42,17 @@ class DownloadSummary:
         """
         return self._deleted_files.copy()

-    def merge(self, summary: 'DownloadSummary') -> None:
+    def merge(self, summary: "DownloadSummary") -> None:
         """
         Merges ourselves with the passed summary. Modifies this object, but not the passed one.
         """
         self._new_files = _mergeNoDuplicate(self._new_files, summary.new_files)
-        self._modified_files = _mergeNoDuplicate(self._modified_files, summary.modified_files)
-        self._deleted_files = _mergeNoDuplicate(self._deleted_files, summary.deleted_files)
+        self._modified_files = _mergeNoDuplicate(
+            self._modified_files, summary.modified_files
+        )
+        self._deleted_files = _mergeNoDuplicate(
+            self._deleted_files, summary.deleted_files
+        )

     def add_deleted_file(self, path: Path) -> None:
         """
@@ -65,4 +65,6 @@ class HttpDownloader:
             self._organizer.accept_file(tmp_file, info.path)
         else:
             # TODO use proper exception
-            raise Exception(f"Could not download file, got response {response.status_code}")
+            raise Exception(
+                f"Could not download file, got response {response.status_code}"
+            )
@@ -19,13 +19,14 @@ class FatalException(Exception):
     """


-TFun = TypeVar('TFun', bound=Callable[..., Any])
+TFun = TypeVar("TFun", bound=Callable[..., Any])


 def swallow_and_print_errors(function: TFun) -> TFun:
     """
     Decorates a function, swallows all errors, logs them and returns none if one occurred.
     """
+
     def inner(*args: Any, **kwargs: Any) -> Any:
         # pylint: disable=broad-except
         try:

@@ -36,6 +37,7 @@ def swallow_and_print_errors(function: TFun) -> TFun:
         except Exception as error:
             Console().print_exception()
             return None
+
     return cast(TFun, inner)

@@ -43,6 +45,7 @@ def retry_on_io_exception(max_retries: int, message: str) -> Callable[[TFun], TFun]:
     """
     Decorates a function and retries it on any exception until the max retries count is hit.
     """
+
     def retry(function: TFun) -> TFun:
         def inner(*args: Any, **kwargs: Any) -> Any:
             for i in range(0, max_retries):

@@ -52,6 +55,9 @@ def retry_on_io_exception(max_retries: int, message: str) -> Callable[[TFun], TFun]:
                except IOError as error:
                    PRETTY.warning(f"Error duing operation '{message}': {error}")
                    PRETTY.warning(
-                        f"Retrying operation '{message}'. Remaining retries: {max_retries - 1 - i}")
+                        f"Retrying operation '{message}'. Remaining retries: {max_retries - 1 - i}"
+                    )
+
        return cast(TFun, inner)
+
    return retry
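For reference, the commit itself applies this decorator to the crawler's _get_page (see the @retry_on_io_exception(3, "fetching webpage") line further down). A sketch of using it on an arbitrary flaky function, assuming the decorator returns the wrapped function's value on the first successful attempt (the loop body is only partially visible in this hunk):

from PFERD.errors import retry_on_io_exception

attempts = 0


@retry_on_io_exception(3, "fetching webpage")
def flaky_fetch() -> str:
    global attempts
    attempts += 1
    if attempts < 3:
        raise IOError("connection reset")  # logged and retried
    return "page content"


print(flaky_fetch())  # succeeds on the third attempt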
@@ -3,8 +3,18 @@ Synchronizing files from ILIAS instances (https://www.ilias.de/).
 """

 from .authenticators import IliasAuthenticator, KitShibbolethAuthenticator
-from .crawler import (IliasCrawler, IliasCrawlerEntry, IliasDirectoryFilter,
-                      IliasElementType)
-from .downloader import (IliasDownloader, IliasDownloadInfo,
-                         IliasDownloadStrategy, download_everything,
-                         download_modified_or_new)
+from .crawler import (
+    IliasCrawler,
+    IliasCrawlerEntry,
+    IliasDirectoryFilter,
+    IliasElementType,
+)
+from .downloader import (
+    IliasDownloader,
+    IliasDownloadInfo,
+    IliasDownloadStrategy,
+    download_everything,
+    download_modified_or_new,
+)
+
+from .syncronizer import IliasSycronizer, ResultContainer
@@ -84,7 +84,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
             "_eventId_proceed": "",
             "j_username": self._auth.username,
             "j_password": self._auth.password,
-            "csrf_token": csrf_token
+            "csrf_token": csrf_token,
         }
         soup = soupify(await client.post(url, data=data))

@@ -108,9 +108,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
         await client.post(url, data=data)

     async def _authenticate_tfa(
-        self,
-        client: httpx.AsyncClient,
-        soup: bs4.BeautifulSoup
+        self, client: httpx.AsyncClient, soup: bs4.BeautifulSoup
     ) -> bs4.BeautifulSoup:
         # Searching the form here so that this fails before asking for
         # credentials rather than after asking.

@@ -121,10 +119,7 @@ class KitShibbolethAuthenticator(IliasAuthenticator):
         # https://idp.scc.kit.edu/idp/profile/SAML2/Redirect/SSO
         LOGGER.debug("Attempt to log in to Shibboleth with TFA token")
         url = "https://idp.scc.kit.edu" + action
-        data = {
-            "_eventId_proceed": "",
-            "j_tokenNumber": self._tfa_auth.get_token()
-        }
+        data = {"_eventId_proceed": "", "j_tokenNumber": self._tfa_auth.get_token()}
         return soupify(await client.post(url, data=data))

     @staticmethod
@@ -2,20 +2,18 @@
 Contains an ILIAS crawler alongside helper functions.
 """

+from asyncio.queues import Queue
 import datetime
 import json
 import logging
 import re
 from enum import Enum
 from pathlib import Path
-from typing import Any, Callable, Awaitable, Dict, List, Optional, Union
-from urllib.parse import (parse_qs, urlencode, urljoin, urlparse, urlsplit,
-                          urlunsplit)
+from typing import Any, Callable, Awaitable, Dict, List, Optional, Union, Tuple
+from urllib.parse import parse_qs, urlencode, urljoin, urlparse, urlsplit, urlunsplit

+import asyncio
 import bs4
 import httpx
-import asyncio

 from ..errors import FatalException, retry_on_io_exception
 from ..logging import PrettyLogger
@@ -32,10 +30,23 @@ def _sanitize_path_name(name: str) -> str:
     return name.replace("/", "-").replace("\\", "-")


+class ResultContainer:
+    def __init__(self):
+        self._results = []
+
+    def add_result(self, result: IliasDownloadInfo):
+        self._results.append(result)
+
+    def get_results(self) -> List[IliasDownloadInfo]:
+        return self._results
+
+
 class IliasElementType(Enum):
     """
     The type of an ilias element.
     """
+
+    COURSE = "COURSE"
     REGULAR_FOLDER = "REGULAR_FOLDER"
     VIDEO_FOLDER = "VIDEO_FOLDER"
     EXERCISE_FOLDER = "EXERCISE_FOLDER"
@@ -55,6 +66,17 @@ class IliasElementType(Enum):
 IliasDirectoryFilter = Callable[[Path, IliasElementType], bool]


+class InvalidCourseError(FatalException):
+    """
+    A invalid Course ID was encountered
+    """
+
+    def __init__(course_id: str):
+        super(
+            f"Invalid course id {course_id}? I didn't find anything looking like a course!"
+        )
+
+
 class IliasCrawlerEntry:
     # pylint: disable=too-few-public-methods
     """
@@ -66,11 +88,10 @@ class IliasCrawlerEntry:
         path: Path,
         url: Union[str, Callable[[], Awaitable[Optional[str]]]],
         entry_type: IliasElementType,
-        modification_date: Optional[datetime.datetime]
+        modification_date: Optional[datetime.datetime],
     ):
         self.path = path
         if isinstance(url, str):
             # TODO: Dirty hack, remove
             future = asyncio.Future()
             future.set_result(url)
             self.url: Callable[[], Awaitable[Optional[str]]] = lambda: future
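The constructor above adapts a plain string to the lazy Callable[[], Awaitable[Optional[str]]] interface by parking it in a pre-resolved Future, hence the "dirty hack" TODO. An async closure achieves the same without touching a Future; a hypothetical stand-alone sketch (make_url_getter is not part of the diff):

import asyncio
from typing import Awaitable, Callable, Optional


def make_url_getter(url: str) -> Callable[[], Awaitable[Optional[str]]]:
    # Hypothetical helper: an async closure satisfies the lazy-URL interface
    # without eagerly creating a Future outside an event loop.
    async def getter() -> Optional[str]:
        return url

    return getter


async def main() -> None:
    getter = make_url_getter("https://example.org/video.mp4")
    print(await getter())


asyncio.run(main())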
@@ -84,7 +105,10 @@ class IliasCrawlerEntry:
         Converts this crawler entry to an IliasDownloadInfo, if possible.
         This method will only succeed for *File* types.
         """
-        if self.entry_type in [IliasElementType.REGULAR_FILE, IliasElementType.VIDEO_FILE]:
+        if self.entry_type in [
+            IliasElementType.REGULAR_FILE,
+            IliasElementType.VIDEO_FILE,
+        ]:
             return IliasDownloadInfo(self.path, self.url, self.modification_date)
         return None
@@ -102,12 +126,11 @@ class IliasCrawler:
         base_url: str,
         client: httpx.AsyncClient,
         authenticator: IliasAuthenticator,
-        dir_filter: IliasDirectoryFilter
+        dir_filter: IliasDirectoryFilter,
     ):
         """
         Create a new ILIAS crawler.
         """
-
         self._base_url = base_url
         self._client = client
         self._authenticator = authenticator
@@ -125,52 +148,31 @@ class IliasCrawler:

         return urlunsplit((scheme, netloc, path, new_query_string, fragment))

-    async def recursive_crawl_url(self, url: str) -> List[IliasDownloadInfo]:
+    async def recursive_crawl_url(self, url: str) -> IliasCrawlerEntry:
         """
-        Crawls a given url *and all reachable elements in it*.
+        Creates a crawl target for a given url *and all reachable elements in it*.

         Args:
            url {str} -- the *full* url to crawl
         """
-        start_entries: List[IliasCrawlerEntry] = await self._crawl_folder(Path(""), url)
-        return await self._iterate_entries_to_download_infos(start_entries)
-
-    async def crawl_course(self, course_id: str) -> List[IliasDownloadInfo]:
+        return IliasCrawlerEntry(Path(""), url, IliasElementType.REGULAR_FOLDER, None)
+
+    async def crawl_course(self, course_id: str) -> IliasCrawlerEntry:
         """
-        Starts the crawl process for a course, yielding a list of elements to (potentially)
+        Creates a crawl target for a course, yielding a list of elements to (potentially)
         download.

         Arguments:
             course_id {str} -- the course id

         Raises:
             FatalException: if an unrecoverable error occurs or the course id is not valid
         """
         # Start crawling at the given course
         root_url = self._url_set_query_param(
             self._base_url + "/goto.php", "target", f"crs_{course_id}"
         )

-        if not await self._is_course_id_valid(root_url, course_id):
-            raise FatalException(
-                "Invalid course id? I didn't find anything looking like a course!"
-            )
-
-        # And treat it as a folder
-        entries: List[IliasCrawlerEntry] = await self._crawl_folder(Path(""), root_url)
-        return await self._iterate_entries_to_download_infos(entries)
-
-    async def _is_course_id_valid(self, root_url: str, course_id: str) -> bool:
-        response: httpx.Response = await self._client.get(root_url)
-        # We were redirected ==> Non-existant ID
-        if course_id not in str(response.url):
-            return False
-
-        link_element: bs4.Tag = (await self._get_page(root_url, {})).find(id="current_perma_link")
-        if not link_element:
-            return False
-        # It wasn't a course but a category list, forum, etc.
-        return "crs_" in link_element.get("value")
+        return IliasCrawlerEntry(Path(""), root_url, IliasElementType.COURSE, None)

     async def find_course_name(self, course_id: str) -> Optional[str]:
         """
@@ -186,26 +188,28 @@ class IliasCrawler:
         """
         Returns the name of the element at the given URL, if it can find one.
         """
-        focus_element: bs4.Tag = await self._get_page(url, {}).find(id="il_mhead_t_focus")
+        focus_element: bs4.Tag = await self._get_page(url, {}).find(
+            id="il_mhead_t_focus"
+        )
         if not focus_element:
             return None
         return focus_element.text

-    async def crawl_personal_desktop(self) -> List[IliasDownloadInfo]:
+    async def crawl_personal_desktop(self) -> IliasCrawlerEntry:
         """
-        Crawls the ILIAS personal desktop (and every subelements that can be reached from there).
-
-        Raises:
-            FatalException: if an unrecoverable error occurs
+        Creates a crawl target for the ILIAS personal desktop (and every subelements that can be reached from there).
+        download.
         """
-        entries: List[IliasCrawlerEntry] = await self._crawl_folder(
-            Path(""), self._base_url + "?baseClass=ilPersonalDesktopGUI"
+        return IliasCrawlerEntry(
+            Path(""),
+            self._base_url + "?baseClass=ilPersonalDesktopGUI",
+            IliasElementType.REGULAR_FOLDER,
+            None,
         )
-        return await self._iterate_entries_to_download_infos(entries)

-    async def _crawl_worker(self, entries_to_process: asyncio.Queue, result: List[IliasDownloadInfo]):
+    async def _crawl_worker(self, entries_to_process: asyncio.Queue):
         while True:
-            entry = await entries_to_process.get()
+            (entry, results) = await entries_to_process.get()

             if entry.entry_type == IliasElementType.EXTERNAL_LINK:
                 PRETTY.not_searching(entry.path, "external link")
@@ -216,21 +220,25 @@ class IliasCrawler:
                 entries_to_process.task_done()
                 continue

-            if entry.entry_type.is_folder() and not self.dir_filter(entry.path, entry.entry_type):
+            if entry.entry_type.is_folder() and not self.dir_filter(
+                entry.path, entry.entry_type
+            ):
                 PRETTY.not_searching(entry.path, "user filter")
                 entries_to_process.task_done()
                 continue

             download_info = entry.to_download_info()
             if download_info is not None:
-                result.append(download_info)
+                results.add_result(download_info)
                 entries_to_process.task_done()
                 continue

             url = await entry.url()

             if url is None:
-                PRETTY.warning(f"Could not find url for {str(entry.path)!r}, skipping it")
+                PRETTY.warning(
+                    f"Could not find url for {str(entry.path)!r}, skipping it"
+                )
                 entries_to_process.task_done()
                 continue
@@ -238,37 +246,46 @@ class IliasCrawler:

             if entry.entry_type == IliasElementType.EXERCISE_FOLDER:
                 for task in await self._crawl_exercises(entry.path, url):
-                    entries_to_process.put_nowait(task)
+                    entries_to_process.put_nowait((task, results))
                 entries_to_process.task_done()
                 continue
             if entry.entry_type == IliasElementType.REGULAR_FOLDER:
                 for task in await self._crawl_folder(entry.path, url):
-                    entries_to_process.put_nowait(task)
+                    entries_to_process.put_nowait((task, results))
                 entries_to_process.task_done()
                 continue
+            if entry.entry_type == IliasElementType.COURSE:
+                for task in await self._crawl_folder(
+                    entry.path, url, url.split("crs_")[1]
+                ):
+                    entries_to_process.put_nowait((task, results))
+                entries_to_process.task_done()
+                continue
             if entry.entry_type == IliasElementType.VIDEO_FOLDER:
                 for task in await self._crawl_video_directory(entry.path, url):
-                    entries_to_process.put_nowait(task)
+                    entries_to_process.put_nowait((task, results))
                 entries_to_process.task_done()
                 continue

             PRETTY.warning(f"Unknown type: {entry.entry_type}!")


-    async def _iterate_entries_to_download_infos(
-        self,
-        entries: List[IliasCrawlerEntry]
-    ) -> List[IliasDownloadInfo]:
-        result: List[IliasDownloadInfo] = []
+    async def iterate_entries_to_download_infos(
+        self, entries: List[Tuple[IliasCrawlerEntry, ResultContainer]]
+    ):
         crawl_queue = asyncio.Queue()

+        # Setup authentication locks
+        self._auth_event = asyncio.Event()
+        self._auth_lock = asyncio.Lock()
+
         for entry in entries:
             crawl_queue.put_nowait(entry)

         workers = []

         # TODO: Find proper worker limit
-        for _ in range(10):
-            worker = asyncio.create_task(self._crawl_worker(crawl_queue, result))
+        for _ in range(20):
+            worker = asyncio.create_task(self._crawl_worker(crawl_queue))
             workers.append(worker)

         await crawl_queue.join()
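The crawl loop here is the standard asyncio queue/worker-pool shape: every get() must be balanced by a task_done(), including on the early continue branches, or join() never returns. A stripped-down, runnable sketch of the same pattern (no PFERD types involved):

import asyncio


async def worker(queue: asyncio.Queue) -> None:
    while True:
        item = await queue.get()
        # Process the item; real workers may also put_nowait() new work here.
        print("processed", item)
        queue.task_done()


async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    for item in range(5):
        queue.put_nowait(item)

    workers = [asyncio.create_task(worker(queue)) for _ in range(3)]

    await queue.join()  # returns once every get() was matched by task_done()
    for task in workers:
        task.cancel()   # the workers loop forever, so cancel them explicitly
    await asyncio.gather(*workers, return_exceptions=True)


asyncio.run(main())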
@@ -278,13 +295,22 @@ class IliasCrawler:

         # Wait until all worker tasks are cancelled.
         await asyncio.gather(*workers, return_exceptions=True)
-        return result

-    async def _crawl_folder(self, folder_path: Path, url: str) -> List[IliasCrawlerEntry]:
+    async def _crawl_folder(
+        self, folder_path: Path, url: str, course: Optional[str] = None
+    ) -> List[IliasCrawlerEntry]:
         """
         Crawl all files in a folder-like element.
+
+        Raises a InvalidCourseError if the folder is a non existent course.
         """
-        soup = await self._get_page(url, {})
+        soup = await self._get_page(url, {}, check_course_id_valid=course)
+
+        if course is not None:
+            link_element: bs4.Tag = soup.find(id="current_perma_link")
+            # It wasn't a course but a category list, forum, etc.
+            if not link_element or "crs_" not in link_element.get("value"):
+                raise InvalidCourseError(course)

         if soup.find(id="headerimage"):
             element: bs4.Tag = soup.find(id="headerimage")
@@ -301,7 +327,9 @@ class IliasCrawler:
         links: List[bs4.Tag] = soup.select("a.il_ContainerItemTitle")
         for link in links:
             abs_url = self._abs_url_from_link(link)
-            element_path = Path(folder_path, _sanitize_path_name(link.getText().strip()))
+            element_path = Path(
+                folder_path, _sanitize_path_name(link.getText().strip())
+            )
             element_type = self._find_type_from_link(element_path, link, abs_url)

             if element_type == IliasElementType.REGULAR_FILE:
@@ -312,18 +340,24 @@ class IliasCrawler:
                 date_portion = demangle_date(date_portion_str)

                 if not date_portion:
-                    result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)]
+                    result += [
+                        IliasCrawlerEntry(element_path, abs_url, element_type, None)
+                    ]
                     continue

                 rest_of_name = meeting_name
                 if rest_of_name.startswith(date_portion_str):
-                    rest_of_name = rest_of_name[len(date_portion_str):]
+                    rest_of_name = rest_of_name[len(date_portion_str) :]

-                new_name = datetime.datetime.strftime(date_portion, "%Y-%m-%d, %H:%M") \
+                new_name = (
+                    datetime.datetime.strftime(date_portion, "%Y-%m-%d, %H:%M")
                     + rest_of_name
+                )
                 new_path = Path(folder_path, _sanitize_path_name(new_name))
                 result += [
-                    IliasCrawlerEntry(new_path, abs_url, IliasElementType.REGULAR_FOLDER, None)
+                    IliasCrawlerEntry(
+                        new_path, abs_url, IliasElementType.REGULAR_FOLDER, None
+                    )
                 ]
             elif element_type is not None:
                 result += [IliasCrawlerEntry(element_path, abs_url, element_type, None)]
@@ -340,9 +374,7 @@ class IliasCrawler:

     @staticmethod
     def _find_type_from_link(
-        path: Path,
-        link_element: bs4.Tag,
-        url: str
+        path: Path, link_element: bs4.Tag, url: str
     ) -> Optional[IliasElementType]:
         """
         Decides which sub crawler to use for a given top level element.
@@ -370,7 +402,9 @@ class IliasCrawler:
         return None

     @staticmethod
-    def _find_type_from_folder_like(link_element: bs4.Tag, url: str) -> Optional[IliasElementType]:
+    def _find_type_from_folder_like(
+        link_element: bs4.Tag, url: str
+    ) -> Optional[IliasElementType]:
         """
         Try crawling something that looks like a folder.
         """
@@ -414,7 +448,9 @@ class IliasCrawler:
         return IliasElementType.REGULAR_FOLDER

     @staticmethod
-    def _crawl_file(path: Path, link_element: bs4.Tag, url: str) -> List[IliasCrawlerEntry]:
+    def _crawl_file(
+        path: Path, link_element: bs4.Tag, url: str
+    ) -> List[IliasCrawlerEntry]:
         """
         Crawls a file.
         """
@@ -425,14 +461,16 @@ class IliasCrawler:
             "div", {"class": lambda x: "il_ContainerListItem" in x}
         ).select_one(".il_ItemProperties")
         # The first one is always the filetype
-        file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip()
+        file_type = (
+            properties_parent.select_one("span.il_ItemProperty").getText().strip()
+        )

         # The rest does not have a stable order. Grab the whole text and reg-ex the date
         # out of it
         all_properties_text = properties_parent.getText().strip()
         modification_date_match = re.search(
             r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
-            all_properties_text
+            all_properties_text,
         )
         if modification_date_match is None:
             modification_date = None
@@ -446,10 +484,14 @@ class IliasCrawler:
         full_path = Path(path, name + "." + file_type)

         return [
-            IliasCrawlerEntry(full_path, url, IliasElementType.REGULAR_FILE, modification_date)
+            IliasCrawlerEntry(
+                full_path, url, IliasElementType.REGULAR_FILE, modification_date
+            )
         ]

-    async def _crawl_video_directory(self, video_dir_path: Path, url: str) -> List[IliasCrawlerEntry]:
+    async def _crawl_video_directory(
+        self, video_dir_path: Path, url: str
+    ) -> List[IliasCrawlerEntry]:
         """
         Crawl the video overview site.
         """
@@ -462,7 +504,7 @@ class IliasCrawler:
         # in a standalone html page
         video_list_soup = await self._get_page(
             self._abs_url_from_link(content_link),
-            {"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
+            {"limit": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"},
         )

         # If we find a page selected, we probably need to respect pagination
@@ -483,7 +525,7 @@ class IliasCrawler:
         self,
         video_dir_path: Path,
         paged_video_list_soup: bs4.BeautifulSoup,
-        second_stage_url: str
+        second_stage_url: str,
     ) -> List[IliasCrawlerEntry]:
         LOGGER.info("Found paginated video page, trying 800 elements")
@@ -498,7 +540,9 @@ class IliasCrawler:
                 "Could not increase elements per page (table not found)."
                 " Some might not be crawled!"
             )
-            return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
+            return self._crawl_video_directory_second_stage(
+                video_dir_path, paged_video_list_soup
+            )

         match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"])
         if match is None:
@@ -506,12 +550,18 @@ class IliasCrawler:
                 "Could not increase elements per page (table id not found)."
                 " Some might not be crawled!"
             )
-            return self._crawl_video_directory_second_stage(video_dir_path, paged_video_list_soup)
+            return self._crawl_video_directory_second_stage(
+                video_dir_path, paged_video_list_soup
+            )
         table_id = match.group(1)

         extended_video_page = await self._get_page(
             second_stage_url,
-            {f"tbl_xoct_{table_id}_trows": 800, "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
+            {
+                f"tbl_xoct_{table_id}_trows": 800,
+                "cmd": "asyncGetTableGUI",
+                "cmdMode": "asynch",
+            },
         )

         if self._is_paginated_video_page(extended_video_page):
@@ -520,12 +570,12 @@ class IliasCrawler:
             " I will miss elements."
         )

-        return self._crawl_video_directory_second_stage(video_dir_path, extended_video_page)
+        return self._crawl_video_directory_second_stage(
+            video_dir_path, extended_video_page
+        )

     def _crawl_video_directory_second_stage(
-        self,
-        video_dir_path: Path,
-        video_list_soup: bs4.BeautifulSoup
+        self, video_dir_path: Path, video_list_soup: bs4.BeautifulSoup
     ) -> List[IliasCrawlerEntry]:
         """
         Crawls the "second stage" video page. This page contains the actual video urls.
@@ -553,24 +603,27 @@ class IliasCrawler:
         return results

     def _crawl_single_video(
-        self,
-        parent_path: Path,
-        link: bs4.Tag,
-        direct_download: bool
+        self, parent_path: Path, link: bs4.Tag, direct_download: bool
     ) -> List[IliasCrawlerEntry]:
         """
         Crawl a single video based on its "Abspielen" link from the video listing.
         """
         # The link is part of a table with multiple columns, describing metadata.
         # 6th child (1 indexed) is the modification time string
-        modification_string = link.parent.parent.parent.select_one(
-            "td.std:nth-child(6)"
-        ).getText().strip()
-        modification_time = datetime.datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
+        modification_string = (
+            link.parent.parent.parent.select_one("td.std:nth-child(6)")
+            .getText()
+            .strip()
+        )
+        modification_time = datetime.datetime.strptime(
+            modification_string, "%d.%m.%Y - %H:%M"
+        )

-        title = link.parent.parent.parent.select_one(
-            "td.std:nth-child(3)"
-        ).getText().strip()
+        title = (
+            link.parent.parent.parent.select_one("td.std:nth-child(3)")
+            .getText()
+            .strip()
+        )
         title += ".mp4"

         video_path: Path = Path(parent_path, _sanitize_path_name(title))
@@ -580,18 +633,27 @@ class IliasCrawler:
         # The video had a direct download button we can use instead
         if direct_download:
             LOGGER.debug("Using direct download for video %r", str(video_path))
-            return [IliasCrawlerEntry(
-                video_path, video_url, IliasElementType.VIDEO_FILE, modification_time
-            )]
+            return [
+                IliasCrawlerEntry(
+                    video_path,
+                    video_url,
+                    IliasElementType.VIDEO_FILE,
+                    modification_time,
+                )
+            ]

-        return [IliasCrawlerEntry(
-            video_path,
-            self._crawl_video_url_from_play_link(video_url),
-            IliasElementType.VIDEO_FILE,
-            modification_time
-        )]
+        return [
+            IliasCrawlerEntry(
+                video_path,
+                self._crawl_video_url_from_play_link(video_url),
+                IliasElementType.VIDEO_FILE,
+                modification_time,
+            )
+        ]

-    def _crawl_video_url_from_play_link(self, play_url: str) -> Callable[[], Awaitable[Optional[str]]]:
+    def _crawl_video_url_from_play_link(
+        self, play_url: str
+    ) -> Callable[[], Awaitable[Optional[str]]]:
         async def inner() -> Optional[str]:
             # Fetch the actual video page. This is a small wrapper page initializing a javscript
             # player. Sadly we can not execute that JS. The actual video stream url is nowhere
@@ -614,9 +676,12 @@ class IliasCrawler:
             # and fetch the video url!
             video_url = json_object["streams"][0]["sources"]["mp4"][0]["src"]
             return video_url

         return inner

-    async def _crawl_exercises(self, element_path: Path, url: str) -> List[IliasCrawlerEntry]:
+    async def _crawl_exercises(
+        self, element_path: Path, url: str
+    ) -> List[IliasCrawlerEntry]:
         """
         Crawl files offered for download in exercises.
         """
@@ -625,17 +690,21 @@ class IliasCrawler:
         results: List[IliasCrawlerEntry] = []

         # Each assignment is in an accordion container
-        assignment_containers: List[bs4.Tag] = soup.select(".il_VAccordionInnerContainer")
+        assignment_containers: List[bs4.Tag] = soup.select(
+            ".il_VAccordionInnerContainer"
+        )

         for container in assignment_containers:
             # Fetch the container name out of the header to use it in the path
-            container_name = container.select_one(".ilAssignmentHeader").getText().strip()
+            container_name = (
+                container.select_one(".ilAssignmentHeader").getText().strip()
+            )
             # Find all download links in the container (this will contain all the files)
             files: List[bs4.Tag] = container.findAll(
                 name="a",
                 # download links contain the given command class
                 attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x},
-                text="Download"
+                text="Download",
             )

             LOGGER.debug("Found exercise container %r", container_name)
@@ -650,30 +719,47 @@ class IliasCrawler:

                 LOGGER.debug("Found file %r at %r", file_name, url)

-                results.append(IliasCrawlerEntry(
-                    Path(element_path, container_name, file_name),
-                    url,
-                    IliasElementType.REGULAR_FILE,
-                    None  # We do not have any timestamp
-                ))
+                results.append(
+                    IliasCrawlerEntry(
+                        Path(element_path, container_name, file_name),
+                        url,
+                        IliasElementType.REGULAR_FILE,
+                        None,  # We do not have any timestamp
+                    )
+                )

         return results

     @retry_on_io_exception(3, "fetching webpage")
-    async def _get_page(self, url: str, params: Dict[str, Any],
-                        retry_count: int = 0) -> bs4.BeautifulSoup:
+    async def _get_page(
+        self,
+        url: str,
+        params: Dict[str, Any],
+        retry_count: int = 0,
+        check_course_id_valid: Optional[str] = None,
+    ) -> bs4.BeautifulSoup:
         """
         Fetches a page from ILIAS, authenticating when needed.
+
+        Raises a InvalidCourseError if the page is a non existent course.
         """

         if retry_count >= 4:
-            raise FatalException("Could not get a proper page after 4 tries. "
-                                 "Maybe your URL is wrong, authentication fails continuously, "
-                                 "your ILIAS connection is spotty or ILIAS is not well.")
+            raise FatalException(
+                "Could not get a proper page after 4 tries. "
+                "Maybe your URL is wrong, authentication fails continuously, "
+                "your ILIAS connection is spotty or ILIAS is not well."
+            )

         LOGGER.debug("Fetching %r", url)

         response = await self._client.get(url, params=params)

+        if check_course_id_valid is not None:
+            # We were redirected ==> Non-existant ID
+            if check_course_id_valid not in str(response.url):
+                raise InvalidCourseError(check_course_id_valid)
+
         content_type = response.headers["content-type"]

         if not content_type.startswith("text/html"):
@@ -687,11 +773,23 @@ class IliasCrawler:
         if self._is_logged_in(soup):
             return soup

-        LOGGER.info("Not authenticated, changing that...")
-
-        await self._authenticator.authenticate(self._client)
+        if self._auth_lock.locked():
+            # Some other future is already logging in
+            await self._auth_event.wait()
+        else:
+            await self._auth_lock.acquire()
+            self._auth_event.clear()
+            LOGGER.info("Not authenticated, changing that...")
+
+            await self._authenticator.authenticate(self._client)
+            self._auth_event.set()
+            self._auth_lock.release()

-        return await self._get_page(url, params, retry_count + 1)
+        return await self._get_page(
+            url,
+            params,
+            check_course_id_valid=check_course_id_valid,
+            retry_count=retry_count + 1,
+        )

     @staticmethod
     def _is_logged_in(soup: bs4.BeautifulSoup) -> bool:
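The lock/event pair ensures only one crawl worker re-authenticates while the others simply wait for the result. The locked() check and the acquire are not atomic, but losing that race only means a redundant second login, not a failure. A self-contained sketch of the same pattern (SharedLogin is illustrative, not part of the diff):

import asyncio


class SharedLogin:
    """Only the first coroutine performs the login; the rest wait for it."""

    def __init__(self) -> None:
        self._lock = asyncio.Lock()
        self._event = asyncio.Event()

    async def ensure_logged_in(self) -> None:
        if self._lock.locked():
            # Another task is already logging in; wait until it finishes.
            await self._event.wait()
            return
        async with self._lock:
            self._event.clear()
            await asyncio.sleep(0.1)  # stand-in for the real authentication
            self._event.set()


async def main() -> None:
    login = SharedLogin()
    await asyncio.gather(*(login.ensure_logged_in() for _ in range(5)))
    print("all tasks proceeded after a single login")


asyncio.run(main())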
@@ -705,7 +803,7 @@ class IliasCrawler:
         video_table = soup.find(
             recursive=True,
             name="table",
-            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")}
+            attrs={"id": lambda x: x is not None and x.startswith("tbl_xoct")},
         )
         if video_table is not None:
             LOGGER.debug("Auth: Found #tbl_xoct.+")
@@ -25,15 +25,19 @@ def demangle_date(date: str) -> Optional[datetime.datetime]:
     saved = locale.setlocale(locale.LC_ALL)
     try:
         try:
-            locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
+            locale.setlocale(locale.LC_ALL, "de_DE.UTF-8")
         except locale.Error:
             PRETTY.warning(
                 "Could not set language to german. Assuming you use english everywhere."
             )

         date = re.sub(r"\s+", " ", date)
-        date = re.sub("Gestern|Yesterday", _yesterday().strftime("%d. %b %Y"), date, re.I)
-        date = re.sub("Heute|Today", datetime.date.today().strftime("%d. %b %Y"), date, re.I)
+        date = re.sub(
+            "Gestern|Yesterday", _yesterday().strftime("%d. %b %Y"), date, re.I
+        )
+        date = re.sub(
+            "Heute|Today", datetime.date.today().strftime("%d. %b %Y"), date, re.I
+        )
         date = re.sub("Morgen|Tomorrow", _tomorrow().strftime("%d. %b %Y"), date, re.I)
         return datetime.datetime.strptime(date, "%d. %b %Y, %H:%M")
     except ValueError:
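One caveat the reformatting leaves untouched: re.sub's fourth positional parameter is count, not flags, so passing re.I positionally (as in the calls above, unchanged by this commit) caps the number of substitutions instead of making the match case-insensitive. A quick, runnable demonstration:

import re

# re.sub(pattern, repl, string, count=0, flags=0): the fourth positional
# argument is count. re.I has the numeric value 2, so it acts as count=2.
print(re.sub("today", "X", "Today today", re.I))        # "Today X" - still case-sensitive
print(re.sub("today", "X", "Today today", flags=re.I))  # "X X"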
@@ -7,9 +7,9 @@ import os
 from pathlib import Path, PurePath
 from typing import Callable, Awaitable, List, Optional, Union

+import asyncio
 import bs4
 import httpx
-import asyncio

 from ..errors import retry_on_io_exception
 from ..logging import PrettyLogger

@@ -36,7 +36,7 @@ class IliasDownloadInfo(Transformable):
         self,
         path: PurePath,
         url: Union[str, Callable[[], Awaitable[Optional[str]]]],
-        modifcation_date: Optional[datetime.datetime]
+        modifcation_date: Optional[datetime.datetime],
     ):
         super().__init__(path)
         if isinstance(url, str):

@@ -87,7 +87,7 @@ class IliasDownloader:
         client: httpx.Client,
         authenticator: IliasAuthenticator,
         strategy: IliasDownloadStrategy,
-        timeout: int = 5
+        timeout: int = 5,
     ):
         """
         Create a new IliasDownloader.

@@ -133,7 +133,9 @@ class IliasDownloader:
                 return True

             if not await download_impl():
-                PRETTY.error(f"Download of file {info.path} failed too often! Skipping it...")
+                PRETTY.error(
+                    f"Download of file {info.path} failed too often! Skipping it..."
+                )
                 return

             dst_path = self._organizer.accept_file(tmp_file, info.path)

@@ -142,8 +144,8 @@ class IliasDownloader:
                 dst_path,
                 times=(
                     math.ceil(info.modification_date.timestamp()),
-                    math.ceil(info.modification_date.timestamp())
-                )
+                    math.ceil(info.modification_date.timestamp()),
+                ),
             )

     async def _try_download(self, info: IliasDownloadInfo, target: Path) -> bool:

@@ -158,7 +160,9 @@ class IliasDownloader:

         if content_type.startswith("text/html") and not has_content_disposition:
             if self._is_logged_in(soupify(response)):
-                raise ContentTypeException("Attempting to download a web page, not a file")
+                raise ContentTypeException(
+                    "Attempting to download a web page, not a file"
+                )

             return False
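The times tuple in the hunk above feeds what is presumably an os.utime call (the call site itself is just outside the excerpt); its parameter is the (access time, modification time) pair, which the downloader sets to the ILIAS modification date so local timestamps track the remote ones. A small runnable illustration:

import math
import os
import time
from pathlib import Path

path = Path("example.txt")
path.write_text("hello")

# times is the (atime, mtime) pair, in seconds since the epoch.
stamp = math.ceil(time.time()) - 3600
os.utime(path, times=(stamp, stamp))
print(time.ctime(os.path.getmtime(path)))  # reports one hour in the past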
PFERD/ilias/syncronizer.py (new file, 80 lines)

@@ -0,0 +1,80 @@
+from typing import Callable, Awaitable, List, Optional
+
+from .authenticators import IliasAuthenticator
+from .crawler import (
+    IliasCrawler,
+    IliasDirectoryFilter,
+    IliasCrawlerEntry,
+    ResultContainer,
+)
+
+from ..utils import PathLike, to_path
+from ..cookie_jar import CookieJar
+
+
+class IliasSycronizer:
+    """
+    This class is used to manage a ILIAS Crawler
+    """
+
+    def __init__(
+        self,
+        base_url: str,
+        authenticator: IliasAuthenticator,
+        cookies: Optional[PathLike],
+        dir_filter: IliasDirectoryFilter,
+    ):
+        self._cookie_jar = CookieJar(to_path(cookies) if cookies else None)
+        self._cookie_jar.load_cookies()
+        self._authenticator = authenticator
+
+        self._client = self._cookie_jar.create_async_client()
+
+        self._crawler = IliasCrawler(
+            base_url, self._client, self._authenticator, dir_filter
+        )
+        self._targets = []
+
+    def add_target(
+        self,
+        crawl_function: Callable[[IliasCrawler], Awaitable[List[IliasCrawlerEntry]]],
+    ) -> ResultContainer:
+        """
+        Adds a crawl target and returns the ResultContainer, in which DownloadInfos will be saved
+
+        Arguments:
+            crawl_function {Callable[[IliasCrawler], Awaitable[List[IliasCrawlerEntry]]]} -- a callback which should return an awaitable list of IliasCrawlerEntrys
+        """
+        results = ResultContainer()
+        self._targets.append((crawl_function, results))
+        return results
+
+    def get_authenticator(self):
+        """
+        Returns the associated authenticator
+        """
+        return self._authenticator
+
+    def get_cookie_jar(self):
+        """
+        Returns the associated cookie jar
+        """
+        return self._cookie_jar
+
+    async def close_client(self):
+        """
+        Closes the async client
+        """
+        await self._client.aclose()
+
+    async def syncronize(self):
+        """
+        Syncronizes all registered targets
+        """
+        # Populate initial targets
+        entries = []
+        for (crawl_function, results) in self._targets:
+            entries.append((await crawl_function(self._crawler), results))
+
+        await self._crawler.iterate_entries_to_download_infos(entries)
+        self._cookie_jar.save_cookies()
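Putting the new file to work: each add_target call registers a crawl callback and hands back the ResultContainer that the single shared crawler pass will fill. A sketch using only the classes from this diff; the course id is a placeholder, and UserPassAuthenticator is assumed to prompt for missing credentials as described in pferd.py:

import asyncio

from PFERD.authenticators import UserPassAuthenticator
from PFERD.ilias import IliasSycronizer, KitShibbolethAuthenticator


async def main() -> None:
    auth = KitShibbolethAuthenticator(UserPassAuthenticator("ILIAS", None, None))
    ilias = IliasSycronizer(
        "https://ilias.studium.kit.edu/", auth, "cookies.txt", lambda path, kind: True
    )

    # "1234567" is a made-up course id.
    course = ilias.add_target(lambda crawler: crawler.crawl_course("1234567"))
    desktop = ilias.add_target(lambda crawler: crawler.crawl_personal_desktop())

    await ilias.syncronize()  # one shared crawl fills both containers
    print(len(course.get_results()), "course files,",
          len(desktop.get_results()), "desktop files")

    await ilias.close_client()


asyncio.run(main())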
PFERD/ipd.py

@@ -31,6 +31,7 @@ class IpdDownloadInfo(Transformable):
     """
     Information about an ipd entry.
     """
+
     url: str
     modification_date: Optional[datetime.datetime]

@@ -83,9 +84,16 @@ class IpdCrawler:
         items: List[IpdDownloadInfo] = []

         def is_relevant_url(x: str) -> bool:
-            return x.endswith(".pdf") or x.endswith(".c") or x.endswith(".java") or x.endswith(".zip")
+            return (
+                x.endswith(".pdf")
+                or x.endswith(".c")
+                or x.endswith(".java")
+                or x.endswith(".zip")
+            )

-        for link in page.findAll(name="a", attrs={"href": lambda x: x and is_relevant_url(x)}):
+        for link in page.findAll(
+            name="a", attrs={"href": lambda x: x and is_relevant_url(x)}
+        ):
             href: str = link.attrs.get("href")
             name = href.split("/")[-1]

@@ -94,15 +102,19 @@ class IpdCrawler:
                 enclosing_row: bs4.Tag = link.findParent(name="tr")
                 if enclosing_row:
                     date_text = enclosing_row.find(name="td").text
-                    modification_date = datetime.datetime.strptime(date_text, "%d.%m.%Y")
+                    modification_date = datetime.datetime.strptime(
+                        date_text, "%d.%m.%Y"
+                    )
             except ValueError:
                 modification_date = None

-            items.append(IpdDownloadInfo(
-                Path(name),
-                url=self._abs_url_from_link(link),
-                modification_date=modification_date
-            ))
+            items.append(
+                IpdDownloadInfo(
+                    Path(name),
+                    url=self._abs_url_from_link(link),
+                    modification_date=modification_date,
+                )
+            )

         return items

@@ -112,7 +124,9 @@ class IpdDownloader:
     A downloader for ipd files.
     """

-    def __init__(self, tmp_dir: TmpDir, organizer: Organizer, strategy: IpdDownloadStrategy):
+    def __init__(
+        self, tmp_dir: TmpDir, organizer: Organizer, strategy: IpdDownloadStrategy
+    ):
         self._tmp_dir = tmp_dir
         self._organizer = organizer
         self._strategy = strategy

@@ -144,11 +158,13 @@ class IpdDownloader:
                 dst_path,
                 times=(
                     math.ceil(info.modification_date.timestamp()),
-                    math.ceil(info.modification_date.timestamp())
-                )
+                    math.ceil(info.modification_date.timestamp()),
+                ),
             )

         elif response.status_code == 403:
             raise FatalException("Received 403. Are you not using the KIT VPN?")
         else:
-            PRETTY.warning(f"Could not download file, got response {response.status_code}")
+            PRETTY.warning(
+                f"Could not download file, got response {response.status_code}"
+            )
@@ -7,6 +7,7 @@ from pathlib import Path, PurePath

 class ResolveException(Exception):
     """An exception while resolving a file."""
+
     # TODO take care of this when doing exception handling
@@ -40,9 +40,9 @@ class RichLoggingHandler(logging.Handler):

     def __init__(self, level: int) -> None:
         super().__init__(level=level)
-        self.console = Console(theme=Theme({
-            "logging.level.warning": Style(color="yellow")
-        }))
+        self.console = Console(
+            theme=Theme({"logging.level.warning": Style(color="yellow")})
+        )
         self._log_render = LogRender(show_level=True, show_time=False, show_path=False)

     def emit(self, record: logging.LogRecord) -> None:

@@ -81,18 +81,14 @@ class PrettyLogger:
         """
         Print an error message indicating some operation fatally failed.
         """
-        self.logger.error(
-            f"[bold red]{message}[/bold red]"
-        )
+        self.logger.error(f"[bold red]{message}[/bold red]")

     def warning(self, message: str) -> None:
         """
         Print a warning message indicating some operation failed, but the error can be recovered
         or ignored.
         """
-        self.logger.warning(
-            f"[bold yellow]{message}[/bold yellow]"
-        )
+        self.logger.warning(f"[bold yellow]{message}[/bold yellow]")

     def modified_file(self, path: PathLike) -> None:
         """

@@ -108,18 +104,14 @@ class PrettyLogger:
         A new file has been downloaded.
         """

-        self.logger.info(
-            f"[bold green]Created {self._format_path(path)}.[/bold green]"
-        )
+        self.logger.info(f"[bold green]Created {self._format_path(path)}.[/bold green]")

     def deleted_file(self, path: PathLike) -> None:
         """
         A file has been deleted.
         """

-        self.logger.info(
-            f"[bold red]Deleted {self._format_path(path)}.[/bold red]"
-        )
+        self.logger.info(f"[bold red]Deleted {self._format_path(path)}.[/bold red]")

     def ignored_file(self, path: PathLike, reason: str) -> None:
         """

@@ -127,8 +119,7 @@ class PrettyLogger:
         """

         self.logger.info(
-            f"[dim]Ignored {self._format_path(path)} "
-            f"([/dim]{reason}[dim]).[/dim]"
+            f"[dim]Ignored {self._format_path(path)} " f"([/dim]{reason}[dim]).[/dim]"
         )

     def searching(self, path: PathLike) -> None:

@@ -177,8 +168,10 @@ class PrettyLogger:

         subject_str = f"{subject} " if subject else ""
         self.logger.info("")
-        self.logger.info((
-            f"[bold cyan]Synchronizing "
-            f"{subject_str}to {self._format_path(target_directory)} "
-            f"using the {synchronizer_name} synchronizer.[/bold cyan]"
-        ))
+        self.logger.info(
+            (
+                f"[bold cyan]Synchronizing "
+                f"{subject_str}to {self._format_path(target_directory)} "
+                f"using the {synchronizer_name} synchronizer.[/bold cyan]"
+            )
+        )
@@ -29,6 +29,7 @@ class ConflictType(Enum):
     MARKED_FILE_OVERWRITTEN: A file is written for the second+ time in this run
     FILE_DELETED: The file was deleted
     """
+
     FILE_OVERWRITTEN = "overwritten"
     MARKED_FILE_OVERWRITTEN = "marked_file_overwritten"
     FILE_DELETED = "deleted"

@@ -56,7 +57,9 @@ class FileConflictResolution(Enum):
 FileConflictResolver = Callable[[PurePath, ConflictType], FileConflictResolution]


-def resolve_prompt_user(_path: PurePath, conflict: ConflictType) -> FileConflictResolution:
+def resolve_prompt_user(
+    _path: PurePath, conflict: ConflictType
+) -> FileConflictResolution:
     """
     Resolves conflicts by asking the user if a file was written twice or will be deleted.
     """

@@ -72,7 +75,9 @@ class FileAcceptException(Exception):
 class Organizer(Location):
     """A helper for managing downloaded files."""

-    def __init__(self, path: Path, conflict_resolver: FileConflictResolver = resolve_prompt_user):
+    def __init__(
+        self, path: Path, conflict_resolver: FileConflictResolver = resolve_prompt_user
+    ):
         """Create a new organizer for a given path."""
         super().__init__(path)
         self._known_files: Set[Path] = set()

@@ -98,7 +103,7 @@ class Organizer(Location):
         # your path...
         # See:
         # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#maximum-path-length-limitation
-        if os.name == 'nt':
+        if os.name == "nt":
             src_absolute = Path("\\\\?\\" + str(src.resolve()))
             dst_absolute = Path("\\\\?\\" + str(self.resolve(dst)))
         else:

@@ -116,7 +121,9 @@ class Organizer(Location):
         if self._is_marked(dst):
             PRETTY.warning(f"File {str(dst_absolute)!r} was already written!")
             conflict = ConflictType.MARKED_FILE_OVERWRITTEN
-            if self._resolve_conflict("Overwrite file?", dst_absolute, conflict, default=False):
+            if self._resolve_conflict(
+                "Overwrite file?", dst_absolute, conflict, default=False
+            ):
                 PRETTY.ignored_file(dst_absolute, "file was written previously")
                 return None

@@ -201,7 +208,9 @@ class Organizer(Location):
     def _delete_file_if_confirmed(self, path: Path) -> None:
         prompt = f"Do you want to delete {path}"

-        if self._resolve_conflict(prompt, path, ConflictType.FILE_DELETED, default=False):
+        if self._resolve_conflict(
+            prompt, path, ConflictType.FILE_DELETED, default=False
+        ):
             self.download_summary.add_deleted_file(path)
             path.unlink()
         else:
PFERD/pferd.py

@@ -4,20 +4,35 @@ Convenience functions for using PFERD.

 import logging
 from pathlib import Path
-from typing import Callable, Awaitable, List, Optional, Union
+from typing import List, Optional, Union
+import asyncio

 from .authenticators import UserPassAuthenticator
 from .cookie_jar import CookieJar
-from .diva import (DivaDownloader, DivaDownloadStrategy, DivaPlaylistCrawler,
-                   diva_download_new)
+from .diva import (
+    DivaDownloader,
+    DivaDownloadStrategy,
+    DivaPlaylistCrawler,
+    diva_download_new,
+)
 from .download_summary import DownloadSummary
 from .errors import FatalException, swallow_and_print_errors
-from .ilias import (IliasAuthenticator, IliasCrawler, IliasDirectoryFilter,
-                    IliasDownloader, IliasDownloadInfo, IliasDownloadStrategy,
-                    KitShibbolethAuthenticator, download_modified_or_new)
-from .ipd import (IpdCrawler, IpdDownloader, IpdDownloadInfo,
-                  IpdDownloadStrategy, ipd_download_new_or_modified)
+from .ilias import (
+    IliasDirectoryFilter,
+    IliasDownloader,
+    IliasDownloadInfo,
+    IliasDownloadStrategy,
+    KitShibbolethAuthenticator,
+    download_modified_or_new,
+    IliasSycronizer,
+    ResultContainer,
+)
+from .ipd import (
+    IpdCrawler,
+    IpdDownloader,
+    IpdDownloadInfo,
+    IpdDownloadStrategy,
+    ipd_download_new_or_modified,
+)
 from .location import Location
 from .logging import PrettyLogger, enable_logging
 from .organizer import FileConflictResolver, Organizer, resolve_prompt_user
@@ -32,6 +47,36 @@ LOGGER = logging.getLogger(__name__)
 PRETTY = PrettyLogger(LOGGER)


+class IliasTarget:
+    """
+    Used to store associated options for a crawl target and hold the a reference to the results container
+    """
+
+    def __init__(
+        self,
+        results: ResultContainer,
+        target: PathLike,
+        transform: Transform = lambda x: x,
+        download_strategy: IliasDownloadStrategy = download_modified_or_new,
+        clean: bool = True,
+        timeout: int = 5,
+        file_conflict_resolver: FileConflictResolver = resolve_prompt_user,
+    ):
+        self.results = results
+        self.target = target
+        self.transform = transform
+        self.download_strategy = download_strategy
+        self.clean = clean
+        self.timeout = timeout
+        self.file_conflict_resolver = file_conflict_resolver
+
+    def get_results(self) -> List[IliasDownloadInfo]:
+        """
+        Returns the results of the associated crawl target
+        """
+        return self.results.get_results()
+
+
 class Pferd(Location):
     # pylint: disable=too-many-arguments
     """
@@ -40,16 +85,14 @@ class Pferd(Location):
     """

     def __init__(
-        self,
-        base_dir: Path,
-        tmp_dir: Path = Path(".tmp"),
-        test_run: bool = False
+        self, base_dir: Path, tmp_dir: Path = Path(".tmp"), test_run: bool = False
     ):
         super().__init__(Path(base_dir))

         self._download_summary = DownloadSummary()
         self._tmp_dir = TmpDir(self.resolve(tmp_dir))
         self._test_run = test_run
+        self._ilias_targets: List[IliasTarget] = []

     @staticmethod
     def enable_logging() -> None:
@ -73,114 +116,169 @@ class Pferd(Location):
|
|||
inner_auth = UserPassAuthenticator("ILIAS - Pferd.py", username, password)
|
||||
return KitShibbolethAuthenticator(inner_auth)
|
||||
|
||||
async def _ilias(
|
||||
self,
|
||||
target: PathLike,
|
||||
base_url: str,
|
||||
crawl_function: Callable[[IliasCrawler], Awaitable[List[IliasDownloadInfo]]],
|
||||
authenticator: IliasAuthenticator,
|
||||
cookies: Optional[PathLike],
|
||||
dir_filter: IliasDirectoryFilter,
|
||||
transform: Transform,
|
||||
download_strategy: IliasDownloadStrategy,
|
||||
timeout: int,
|
||||
clean: bool = True,
|
||||
file_conflict_resolver: FileConflictResolver = resolve_prompt_user
|
||||
) -> Organizer:
|
||||
# pylint: disable=too-many-locals
|
||||
cookie_jar = CookieJar(to_path(cookies) if cookies else None)
|
||||
client = cookie_jar.create_client()
|
||||
async_client = cookie_jar.create_async_client()
|
||||
tmp_dir = self._tmp_dir.new_subdir()
|
||||
organizer = Organizer(self.resolve(to_path(target)), file_conflict_resolver)
|
||||
|
||||
crawler = IliasCrawler(base_url, async_client, authenticator, dir_filter)
|
||||
downloader = IliasDownloader(tmp_dir, organizer, client,
|
||||
authenticator, download_strategy, timeout)
|
||||
|
||||
cookie_jar.load_cookies()
|
||||
info = await crawl_function(crawler)
|
||||
cookie_jar.save_cookies()
|
||||
|
||||
|
||||
transformed = apply_transform(transform, info)
|
||||
if self._test_run:
|
||||
self._print_transformables(transformed)
|
||||
return organizer
|
||||
|
||||
await downloader.download_all(transformed)
|
||||
cookie_jar.save_cookies()
|
||||
|
||||
if clean:
|
||||
organizer.cleanup()
|
||||
|
||||
await async_client.aclose()
|
||||
return organizer
|
||||
|
||||
@swallow_and_print_errors
|
||||
def ilias_kit(
|
||||
self,
|
||||
target: PathLike,
|
||||
course_id: str,
|
||||
dir_filter: IliasDirectoryFilter = lambda x, y: True,
|
||||
transform: Transform = lambda x: x,
|
||||
cookies: Optional[PathLike] = None,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
download_strategy: IliasDownloadStrategy = download_modified_or_new,
|
||||
clean: bool = True,
|
||||
timeout: int = 5,
|
||||
file_conflict_resolver: FileConflictResolver = resolve_prompt_user
|
||||
) -> Organizer:
|
||||
) -> IliasSycronizer:
|
||||
"""
|
||||
Synchronizes a folder with the ILIAS instance of the KIT.
|
||||
|
||||
Arguments:
|
||||
target {Path} -- the target path to write the data to
|
||||
course_id {str} -- the id of the main course page (found in the URL after ref_id
|
||||
when opening the course homepage)
|
||||
Create a ILIAS Sycronizer for the ILIAS instance of the KIT.
|
||||
|
||||
Keyword Arguments:
|
||||
dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the
|
||||
crawler level, these directories and all of their content is skipped.
|
||||
(default: {lambdax:True})
|
||||
transform {Transform} -- A transformation function for the output paths. Return None
|
||||
to ignore a file. (default: {lambdax:x})
|
||||
cookies {Optional[Path]} -- The path to store and load cookies from.
|
||||
(default: {None})
|
||||
username {Optional[str]} -- The SCC username. If none is given, it will prompt
|
||||
the user. (default: {None})
|
||||
password {Optional[str]} -- The SCC password. If none is given, it will prompt
|
||||
the user. (default: {None})
|
||||
"""
|
||||
|
||||
# This authenticator only works with the KIT ilias instance.
|
||||
authenticator = Pferd._get_authenticator(username=username, password=password)
|
||||
return IliasSycronizer(
|
||||
"https://ilias.studium.kit.edu/", authenticator, cookies, dir_filter
|
||||
)
|
||||
# TODO: Format crawler output
|
||||
##PRETTY.starting_synchronizer(target, "ILIAS", course_id)
|
||||
    def add_ilias_personal_desktop(
        self,
        ilias: IliasSycronizer,
        target: PathLike,
        transform: Transform = lambda x: x,
        download_strategy: IliasDownloadStrategy = download_modified_or_new,
        clean: bool = True,
        timeout: int = 5,
        file_conflict_resolver: FileConflictResolver = resolve_prompt_user,
    ):
        """
        Add the ILIAS "personal desktop" as a crawl target.

        Arguments:
            ilias {IliasSycronizer} -- the ILIAS instance
            target {Path} -- the target path to write the data to

        Keyword Arguments:
            transform {Transform} -- A transformation function for the output paths. Return None
                to ignore a file. (default: {lambda x: x})
            download_strategy {DownloadStrategy} -- A function to determine which files need to
                be downloaded. Can save bandwidth and reduce the number of requests.
                (default: {download_modified_or_new})
            clean {bool} -- Whether to clean up when the method finishes.
            timeout {int} -- The download timeout for opencast videos.
            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
                requests bug.
            file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
                with overwriting or deleting files. The default always asks the user.
        """
        # This authenticator only works with the KIT ILIAS instance.
        authenticator = Pferd._get_authenticator(username=username, password=password)
        PRETTY.starting_synchronizer(target, "ILIAS", course_id)
        results = ilias.add_target(
            lambda crawler: crawler.crawl_personal_desktop(),
        )
        target = IliasTarget(
            results,
            target,
            transform,
            download_strategy,
            clean,
            timeout,
            file_conflict_resolver,
        )
        self._ilias_targets.append(target)

        organizer = asyncio.run(self._ilias(
            target=target,
            base_url="https://ilias.studium.kit.edu/",
            crawl_function=lambda crawler: crawler.crawl_course(course_id),
            authenticator=authenticator,
            cookies=cookies,
            dir_filter=dir_filter,
            transform=transform,
            download_strategy=download_strategy,
            clean=clean,
            timeout=timeout,
            file_conflict_resolver=file_conflict_resolver
        ))

    def add_ilias_folder(
        self,
        ilias: IliasSycronizer,
        target: PathLike,
        course_id: str,
        transform: Transform = lambda x: x,
        download_strategy: IliasDownloadStrategy = download_modified_or_new,
        clean: bool = True,
        timeout: int = 5,
        file_conflict_resolver: FileConflictResolver = resolve_prompt_user,
    ):
        """
        Add a course to synchronize.

        Arguments:
            ilias {IliasSycronizer} -- the ILIAS instance
            target {Path} -- the target path to write the data to
            course_id {str} -- the id of the main course page (found in the URL after ref_id
                when opening the course homepage)

        Keyword Arguments:
            transform {Transform} -- A transformation function for the output paths. Return None
                to ignore a file. (default: {lambda x: x})
            download_strategy {DownloadStrategy} -- A function to determine which files need to
                be downloaded. Can save bandwidth and reduce the number of requests.
                (default: {download_modified_or_new})
            clean {bool} -- Whether to clean up when the method finishes.
            timeout {int} -- The download timeout for opencast videos. Sadly needed due to a
                requests bug.
            file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
                with overwriting or deleting files. The default always asks the user.
        """

        results = ilias.add_target(
            lambda crawler: crawler.crawl_course(course_id),
        )
        target = IliasTarget(
            results,
            target,
            transform,
            download_strategy,
            clean,
            timeout,
            file_conflict_resolver,
        )
        self._ilias_targets.append(target)

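Both add_* methods funnel their arguments into an IliasTarget record before appending it to self._ilias_targets. That record's definition is not part of this diff; a minimal sketch of what it plausibly looks like, with every field inferred from the positional construction and the entry.* accesses below (treat all of it as an assumption):

from dataclasses import dataclass
from typing import Any, Callable


@dataclass
class IliasTarget:
    """Hypothetical shape of a queued crawl target, not taken from this commit."""

    results: Any                 # handle returned by IliasSycronizer.add_target
    target: Any                  # output directory (PathLike)
    transform: Callable          # path transformation; a None result skips a file
    download_strategy: Callable  # e.g. download_modified_or_new
    clean: bool                  # run organizer.cleanup() afterwards
    timeout: int                 # opencast download timeout in seconds
    file_conflict_resolver: Callable

    def get_results(self) -> Any:
        # _syncronize_ilias calls entry.get_results(); presumably this resolves
        # the crawl handle into the list of downloadable entries.
        return self.results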
    async def _syncronize_ilias(self, ilias: IliasSycronizer):
        await ilias.syncronize()

        cookie_jar = ilias.get_cookie_jar()
        cookie_jar.save_cookies()
        authenticator = ilias.get_authenticator()

        client = cookie_jar.create_client()
        for entry in self._ilias_targets:
            tmp_dir = self._tmp_dir.new_subdir()
            organizer = Organizer(
                self.resolve(to_path(entry.target)), entry.file_conflict_resolver
            )

            downloader = IliasDownloader(
                tmp_dir,
                organizer,
                client,
                authenticator,
                entry.download_strategy,
                entry.timeout,
            )

            transformed = apply_transform(entry.transform, entry.get_results())
            if self._test_run:
                self._print_transformables(transformed)
                return organizer

            await downloader.download_all(transformed)

            if entry.clean:
                organizer.cleanup()

            self._download_summary.merge(organizer.download_summary)

            return organizer
        await ilias.close_client()

    def syncronize_ilias(self, ilias: IliasSycronizer):
        """
        Synchronize a given ILIAS instance.

        Arguments:
            ilias {IliasSycronizer} -- the ILIAS instance
        """
        asyncio.run(self._syncronize_ilias(ilias))

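Because authentication and crawling live in the IliasSycronizer while downloading happens per registered target, one login session can feed several output folders. A sketch of that batching (ids and folder names invented):

ilias = pferd.ilias_kit(target="ILIAS", course_id="1234567")

pferd.add_ilias_personal_desktop(ilias, target="Desktop")
pferd.add_ilias_folder(ilias, target="GBI", course_id="1234567")
pferd.add_ilias_folder(ilias, target="HM1", course_id="7654321")

pferd.syncronize_ilias(ilias)  # one authenticated session, three download targets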
    def print_summary(self) -> None:
        """

@ -188,136 +286,6 @@ class Pferd(Location):
"""
|
||||
PRETTY.summary(self._download_summary)
|
||||
|
||||
@swallow_and_print_errors
|
||||
def ilias_kit_personal_desktop(
|
||||
self,
|
||||
target: PathLike,
|
||||
dir_filter: IliasDirectoryFilter = lambda x, y: True,
|
||||
transform: Transform = lambda x: x,
|
||||
cookies: Optional[PathLike] = None,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
download_strategy: IliasDownloadStrategy = download_modified_or_new,
|
||||
clean: bool = True,
|
||||
timeout: int = 5,
|
||||
file_conflict_resolver: FileConflictResolver = resolve_prompt_user
|
||||
) -> Organizer:
|
||||
"""
|
||||
Synchronizes a folder with the ILIAS instance of the KIT. This method will crawl the ILIAS
|
||||
"personal desktop" instead of a single course.
|
||||
|
||||
Arguments:
|
||||
target {Path} -- the target path to write the data to
|
||||
|
||||
Keyword Arguments:
|
||||
dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the
|
||||
crawler level, these directories and all of their content is skipped.
|
||||
(default: {lambdax:True})
|
||||
transform {Transform} -- A transformation function for the output paths. Return None
|
||||
to ignore a file. (default: {lambdax:x})
|
||||
cookies {Optional[Path]} -- The path to store and load cookies from.
|
||||
(default: {None})
|
||||
username {Optional[str]} -- The SCC username. If none is given, it will prompt
|
||||
the user. (default: {None})
|
||||
password {Optional[str]} -- The SCC password. If none is given, it will prompt
|
||||
the user. (default: {None})
|
||||
download_strategy {DownloadStrategy} -- A function to determine which files need to
|
||||
be downloaded. Can save bandwidth and reduce the number of requests.
|
||||
(default: {download_modified_or_new})
|
||||
clean {bool} -- Whether to clean up when the method finishes.
|
||||
timeout {int} -- The download timeout for opencast videos.
|
||||
file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
|
||||
with overwriting or deleting files. The default always asks the user.
|
||||
"""
|
||||
# This authenticator only works with the KIT ilias instance.
|
||||
authenticator = Pferd._get_authenticator(username, password)
|
||||
PRETTY.starting_synchronizer(target, "ILIAS", "Personal Desktop")
|
||||
|
||||
organizer = asyncio.run(self._ilias(
|
||||
target=target,
|
||||
base_url="https://ilias.studium.kit.edu/",
|
||||
crawl_function=lambda crawler: crawler.crawl_personal_desktop(),
|
||||
authenticator=authenticator,
|
||||
cookies=cookies,
|
||||
dir_filter=dir_filter,
|
||||
transform=transform,
|
||||
download_strategy=download_strategy,
|
||||
clean=clean,
|
||||
timeout=timeout,
|
||||
file_conflict_resolver=file_conflict_resolver
|
||||
))
|
||||
|
||||
self._download_summary.merge(organizer.download_summary)
|
||||
|
||||
return organizer
|
||||
|
||||
    @swallow_and_print_errors
    def ilias_kit_folder(
        self,
        target: PathLike,
        full_url: str,
        dir_filter: IliasDirectoryFilter = lambda x, y: True,
        transform: Transform = lambda x: x,
        cookies: Optional[PathLike] = None,
        username: Optional[str] = None,
        password: Optional[str] = None,
        download_strategy: IliasDownloadStrategy = download_modified_or_new,
        clean: bool = True,
        timeout: int = 5,
        file_conflict_resolver: FileConflictResolver = resolve_prompt_user
    ) -> Organizer:
        """
        Synchronizes a folder with a given folder on the ILIAS instance of the KIT.

        Arguments:
            target {Path} -- the target path to write the data to
            full_url {str} -- the full URL of the folder/videos/course to crawl

        Keyword Arguments:
            dir_filter {IliasDirectoryFilter} -- A filter for directories. Will be applied on the
                crawler level; these directories and all of their content are skipped.
                (default: {lambda x, y: True})
            transform {Transform} -- A transformation function for the output paths. Return None
                to ignore a file. (default: {lambda x: x})
            cookies {Optional[Path]} -- The path to store and load cookies from.
                (default: {None})
            username {Optional[str]} -- The SCC username. If none is given, it will prompt
                the user. (default: {None})
            password {Optional[str]} -- The SCC password. If none is given, it will prompt
                the user. (default: {None})
            download_strategy {DownloadStrategy} -- A function to determine which files need to
                be downloaded. Can save bandwidth and reduce the number of requests.
                (default: {download_modified_or_new})
            clean {bool} -- Whether to clean up when the method finishes.
            timeout {int} -- The download timeout for opencast videos.
            file_conflict_resolver {FileConflictResolver} -- A function specifying how to deal
                with overwriting or deleting files. The default always asks the user.
        """
        # This authenticator only works with the KIT ILIAS instance.
        authenticator = Pferd._get_authenticator(username=username, password=password)
        PRETTY.starting_synchronizer(target, "ILIAS", "An ILIAS element by url")

        if not full_url.startswith("https://ilias.studium.kit.edu"):
            raise FatalException("Not a valid KIT ILIAS URL")

        organizer = asyncio.run(self._ilias(
            target=target,
            base_url="https://ilias.studium.kit.edu/",
            crawl_function=lambda crawler: crawler.recursive_crawl_url(full_url),
            authenticator=authenticator,
            cookies=cookies,
            dir_filter=dir_filter,
            transform=transform,
            download_strategy=download_strategy,
            clean=clean,
            timeout=timeout,
            file_conflict_resolver=file_conflict_resolver
        ))

        self._download_summary.merge(organizer.download_summary)

        return organizer

    @swallow_and_print_errors
    def ipd_kit(
        self,
@ -326,7 +294,7 @@ class Pferd(Location):
        transform: Transform = lambda x: x,
        download_strategy: IpdDownloadStrategy = ipd_download_new_or_modified,
        clean: bool = True,
        file_conflict_resolver: FileConflictResolver = resolve_prompt_user
        file_conflict_resolver: FileConflictResolver = resolve_prompt_user,
    ) -> Organizer:
        """
        Synchronizes a folder with a DIVA playlist.

@ -365,7 +333,9 @@ class Pferd(Location):
            self._print_transformables(transformed)
            return organizer

        downloader = IpdDownloader(tmp_dir=tmp_dir, organizer=organizer, strategy=download_strategy)
        downloader = IpdDownloader(
            tmp_dir=tmp_dir, organizer=organizer, strategy=download_strategy
        )
        downloader.download_all(transformed)

        if clean:

@ -383,7 +353,7 @@ class Pferd(Location):
        transform: Transform = lambda x: x,
        download_strategy: DivaDownloadStrategy = diva_download_new,
        clean: bool = True,
        file_conflict_resolver: FileConflictResolver = resolve_prompt_user
        file_conflict_resolver: FileConflictResolver = resolve_prompt_user,
    ) -> Organizer:
        """
        Synchronizes a folder with a DIVA playlist.

@ -8,9 +8,15 @@ from typing import Optional, Type

import httpx
from rich.console import Console
from rich.progress import (BarColumn, DownloadColumn, Progress, TaskID,
                           TextColumn, TimeRemainingColumn,
                           TransferSpeedColumn)
from rich.progress import (
    BarColumn,
    DownloadColumn,
    Progress,
    TaskID,
    TextColumn,
    TimeRemainingColumn,
    TransferSpeedColumn,
)

_progress: Progress = Progress(
    TextColumn("[bold blue]{task.fields[name]}", justify="right"),

@ -23,7 +29,7 @@ _progress: Progress = Progress(
    "•",
    TimeRemainingColumn(),
    console=Console(file=sys.stdout),
    transient=True
    transient=True,
)

@ -47,11 +53,12 @@ class ProgressSettings:
    """
    Settings you can pass to customize the progress bar.
    """

    name: str
    max_size: int


def progress_for(settings: Optional[ProgressSettings]) -> 'ProgressContextManager':
def progress_for(settings: Optional[ProgressSettings]) -> "ProgressContextManager":
    """
    Returns a context manager that displays progress

@ -70,16 +77,14 @@ class ProgressContextManager:
        self._settings = settings
        self._task_id: Optional[TaskID] = None

    def __enter__(self) -> 'ProgressContextManager':
    def __enter__(self) -> "ProgressContextManager":
        """Context manager entry function."""
        if not self._settings:
            return self

        _progress.start()
        self._task_id = _progress.add_task(
            self._settings.name,
            total=self._settings.max_size,
            name=self._settings.name
            self._settings.name, total=self._settings.max_size, name=self._settings.name
        )
        return self
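The context manager wraps the module-level rich Progress; passing None as the settings turns it into a no-op. A usage sketch, assuming the manager also exposes an advance() method for incrementing the task (not shown in this hunk):

settings = ProgressSettings(name="lecture.mp4", max_size=3 * 1024)

with progress_for(settings) as progress:
    for chunk in [b"x" * 1024] * 3:   # stand-in for real download chunks
        progress.advance(len(chunk))  # advance() is an assumption about the full class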
@ -25,7 +25,7 @@ class TmpDir(Location):
        """Format the folder as a string."""
        return f"Folder at {self.path}"

    def __enter__(self) -> 'TmpDir':
    def __enter__(self) -> "TmpDir":
        """Context manager entry function."""
        return self

@ -52,7 +52,7 @@ class TmpDir(Location):

        return self.resolve(Path(name))

    def new_subdir(self, prefix: Optional[str] = None) -> 'TmpDir':
    def new_subdir(self, prefix: Optional[str] = None) -> "TmpDir":
        """
        Create a new nested temporary folder and return it.
        """
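TmpDir doubles as a context manager and can mint nested scratch folders; the ILIAS code above grabs one per download target via new_subdir(). A short sketch of that pattern (the constructor argument is assumed):

from pathlib import Path

with TmpDir(Path(".tmp")) as tmp_dir:     # root scratch path assumed
    for name in ["course-a", "course-b"]:
        sub = tmp_dir.new_subdir()        # one isolated folder per target
        print(f"downloading {name} into {sub.path}")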
@ -45,11 +45,14 @@ def apply_transform(
            result.append(transformable)
    return result


# Transform combinators


def keep(path: PurePath) -> Optional[PurePath]:
    return path


def attempt(*args: Transform) -> Transform:
    def inner(path: PurePath) -> Optional[PurePath]:
        for transform in args:

@ -57,11 +60,14 @@ def attempt(*args: Transform) -> Transform:
            if result:
                return result
        return None

    return inner


def optionally(transform: Transform) -> Transform:
    return attempt(transform, lambda path: path)


def do(*args: Transform) -> Transform:
    def inner(path: PurePath) -> Optional[PurePath]:
        current = path

@ -72,43 +78,56 @@ def do(*args: Transform) -> Transform:
            else:
                return None
        return current

    return inner


def predicate(pred: Callable[[PurePath], bool]) -> Transform:
    def inner(path: PurePath) -> Optional[PurePath]:
        if pred(path):
            return path
        return None

    return inner


def glob(pattern: str) -> Transform:
    return predicate(lambda path: path.match(pattern))


def move_dir(source_dir: PathLike, target_dir: PathLike) -> Transform:
    source_path = to_path(source_dir)
    target_path = to_path(target_dir)

    def inner(path: PurePath) -> Optional[PurePath]:
        if source_path in path.parents:
            return target_path / path.relative_to(source_path)
        return None

    return inner


def move(source: PathLike, target: PathLike) -> Transform:
    source_path = to_path(source)
    target_path = to_path(target)

    def inner(path: PurePath) -> Optional[PurePath]:
        if path == source_path:
            return target_path
        return None

    return inner


def rename(source: str, target: str) -> Transform:
    def inner(path: PurePath) -> Optional[PurePath]:
        if path.name == source:
            return path.with_name(target)
        return None

    return inner


def re_move(regex: Regex, target: str) -> Transform:
    def inner(path: PurePath) -> Optional[PurePath]:
        match = to_pattern(regex).fullmatch(str(path))

@ -117,8 +136,10 @@ def re_move(regex: Regex, target: str) -> Transform:
            groups.extend(match.groups())
            return PurePath(target.format(*groups))
        return None

    return inner


def re_rename(regex: Regex, target: str) -> Transform:
    def inner(path: PurePath) -> Optional[PurePath]:
        match = to_pattern(regex).fullmatch(path.name)

@ -127,6 +148,7 @@ def re_rename(regex: Regex, target: str) -> Transform:
            groups.extend(match.groups())
            return path.with_name(target.format(*groups))
        return None

    return inner

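These combinators compose small path rewrites into a single Transform: attempt tries alternatives in order and takes the first match, do chains steps, and the move/rename family provides the individual steps. A sketch of a typical composition (folder names and patterns invented; the combinators themselves come from the transforms module shown above):

from pathlib import PurePath

transform_course = attempt(
    move_dir("Tutorien", "Tutorium"),                 # fold all tutorials into one folder
    do(glob("Vorlesung/*.pdf"), move_dir("Vorlesung", "Slides")),
    re_rename(r"Blatt_(\d+)\.pdf", "sheet-{1}.pdf"),  # {1} is the first captured group
    keep,                                             # fall back to the original path
)

print(transform_course(PurePath("Tutorien/Blatt_01.pdf")))   # Tutorium/Blatt_01.pdf
print(transform_course(PurePath("Blaetter/Blatt_01.pdf")))   # Blaetter/sheet-01.pdf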
@ -136,7 +158,7 @@ def sanitize_windows_path(path: PurePath) -> PurePath:
    This method is a no-op on other operating systems.
    """
    # Escape windows illegal path characters
    if os.name == 'nt':
    if os.name == "nt":
        sanitized_parts = [re.sub(r'[<>:"/|?]', "_", x) for x in list(path.parts)]
        return PurePath(*sanitized_parts)
    return path
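On Windows, the reserved characters in each path component are replaced with underscores, one part at a time so the separators survive. A quick illustration of the substitution (with the platform check skipped so it runs anywhere):

import re
from pathlib import PurePath

parts = PurePath('Lecture: "Intro"/Q&A?.txt').parts
print(PurePath(*(re.sub(r'[<>:"/|?]', "_", part) for part in parts)))
# -> Lecture_ _Intro_/Q&A_.txt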
@ -62,7 +62,7 @@ def stream_to_path(
    else:
        settings = None

    with open(target, 'wb') as file_descriptor:
    with open(target, "wb") as file_descriptor:
        with progress_for(settings) as progress:
            for chunk in response.iter_bytes():
                file_descriptor.write(chunk)
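stream_to_path writes an httpx response to disk chunk by chunk, with the progress bar riding along. A hedged sketch of a call site, assuming the response is opened in streaming mode (required for iter_bytes on large bodies) and that the third argument is the display name for the progress bar:

import httpx
from pathlib import Path

client = httpx.Client()
with client.stream("GET", "https://example.com/lecture.mp4") as response:
    stream_to_path(response, Path("lecture.mp4"), "lecture.mp4")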