From c687d4a51a27bb2121293282a4640c91c5a4ac14 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Mon, 24 May 2021 13:10:19 +0200
Subject: [PATCH] Implement cookie sharing

Crawlers that use the same authenticator can now share their cookies.
Each crawler still saves cookies to its own jar, but on startup it
loads the most recently modified jar among all crawlers sharing its
authenticator, so a session established by one crawler is reused by
the others. The behaviour is controlled by the new `share_cookies`
option in the default section (with a matching --share-cookies CLI
flag) and is enabled by default.

While at it, cookie loading and saving were turned into synchronous
helpers with better log output, and --explain now writes "yes"/"no"
instead of "true"/"false", matching the values documented in CONFIG.md.

---
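Notes:

Example of a setup where cookie sharing kicks in: two crawlers that
crawl the same ILIAS instance with the same account. A minimal sketch
of a pferd.cfg; apart from `share_cookies`, the section and option
values (auth type, targets) are illustrative only and not taken from
this patch:

    [DEFAULT]
    share_cookies = yes

    [auth:ilias]
    type = simple

    [crawl:lecture-a]
    type = kit-ilias-web
    auth = auth:ilias
    target = 1234567

    [crawl:lecture-b]
    type = kit-ilias-web
    auth = auth:ilias
    target = 7654321

Both crawlers resolve the same authenticator section, so their cookie
jar paths end up in one shared list.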
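The interesting part of http_crawler.py is how a crawler picks the jar
to load: _load_cookies() walks all jar paths registered for its
authenticator and takes the one with the newest mtime, i.e. the one
most recently saved after a successful authentication. A standalone
sketch of that selection, assuming plain pathlib paths
(`newest_cookie_jar` is a made-up helper name, not part of the patch):

    from pathlib import Path
    from typing import List, Optional

    def newest_cookie_jar(paths: List[Path]) -> Optional[Path]:
        # Mirrors the mtime comparison in HttpCrawler._load_cookies():
        # skip jars that don't exist yet, keep the newest of the rest.
        newest: Optional[Path] = None
        max_mtime: Optional[float] = None
        for path in paths:
            if not path.is_file():
                continue  # this crawler hasn't saved cookies yet
            mtime = path.stat().st_mtime
            if max_mtime is None or mtime > max_mtime:
                max_mtime = mtime
                newest = path
        return newest

If no shared jar exists yet (first run), no cookies are loaded and the
crawler starts with a fresh session.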
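Wiring in pferd.py: Pferd keeps one Dict[Authenticator, List[Path]] and
lets every KitIliasWebCrawler register its jar path under its main
authenticator, so only crawlers that log in with the same account share
anything. A rough equivalent of HttpCrawler.share_cookies(), with plain
strings standing in for Authenticator instances and made-up jar paths
(the real jar file name comes from HttpCrawler.COOKIE_FILE, which this
patch doesn't show):

    from pathlib import Path
    from typing import Dict, List

    def share_cookies(auth: str, jar: Path, shared: Dict[str, List[Path]]) -> None:
        # The first crawler for an authenticator creates the shared
        # list; later ones append to the same list object.
        shared.setdefault(auth, []).append(jar)

    shared: Dict[str, List[Path]] = {}
    share_cookies("auth:ilias", Path("Lecture-A/.cookies"), shared)
    share_cookies("auth:ilias", Path("Lecture-B/.cookies"), shared)
    assert shared["auth:ilias"] == [Path("Lecture-A/.cookies"),
                                    Path("Lecture-B/.cookies")]

Because each crawler holds a reference to the shared list itself, jar
paths registered by later crawlers are also visible to crawlers loaded
earlier.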
 CONFIG.md                                  |  5 ++
 PFERD/cli/parser.py                        |  9 ++-
 PFERD/config.py                            |  3 +
 PFERD/crawl/http_crawler.py                | 80 +++++++++++++++++-----
 PFERD/crawl/ilias/kit_ilias_web_crawler.py |  9 ++-
 PFERD/pferd.py                             | 10 ++-
 6 files changed, 95 insertions(+), 21 deletions(-)

diff --git a/CONFIG.md b/CONFIG.md
index dcc7421..7e8a717 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -25,6 +25,11 @@ default values for the other sections.
   `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. By
+  default, crawlers are isolated and don't interact with each other. This
+  includes their cookies. However, in situations where multiple crawlers crawl
+  the same website using the same account, sharing cookies between crawlers can
+  make sense. (Default: `yes`)
 
 ## The `crawl:*` sections
 
diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py
index 346070f..72abb76 100644
--- a/PFERD/cli/parser.py
+++ b/PFERD/cli/parser.py
@@ -169,6 +169,11 @@ PARSER.add_argument(
     action=BooleanOptionalAction,
     help="print a report of all local changes before exiting"
 )
+PARSER.add_argument(
+    "--share-cookies",
+    action=BooleanOptionalAction,
+    help="whether crawlers should share cookies where applicable"
+)
 
 
 def load_default_section(
@@ -180,7 +185,9 @@ def load_default_section(
     if args.working_dir is not None:
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
-        section["explain"] = "true" if args.explain else "false"
+        section["explain"] = "yes" if args.explain else "no"
+    if args.share_cookies is not None:
+        section["share_cookies"] = "yes" if args.share_cookies else "no"
 
 
 SUBPARSERS = PARSER.add_subparsers(title="crawlers")
diff --git a/PFERD/config.py b/PFERD/config.py
index 0c99683..abd6e9e 100644
--- a/PFERD/config.py
+++ b/PFERD/config.py
@@ -81,6 +81,9 @@ class DefaultSection(Section):
     def report(self) -> bool:
         return self.s.getboolean("report", fallback=True)
 
+    def share_cookies(self) -> bool:
+        return self.s.getboolean("share_cookies", fallback=True)
+
 
 class Config:
     @staticmethod
diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py
index 8cd6afe..facc2ba 100644
--- a/PFERD/crawl/http_crawler.py
+++ b/PFERD/crawl/http_crawler.py
@@ -1,10 +1,11 @@
 import asyncio
-from pathlib import PurePath
-from typing import Optional
+from pathlib import Path, PurePath
+from typing import Dict, List, Optional
 
 import aiohttp
 from aiohttp.client import ClientTimeout
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
 from ..utils import fmt_real_path
@@ -25,17 +26,22 @@ class HttpCrawler(Crawler):
         name: str,
         section: HttpCrawlerSection,
         config: Config,
+        shared_auth: Optional[Authenticator] = None,
     ) -> None:
         super().__init__(name, section, config)
 
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
         self._http_timeout = section.http_timeout()
 
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_auth = shared_auth
+        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
+
+        self._output_dir.register_reserved(self.COOKIE_FILE)
+
     async def _current_auth_id(self) -> int:
         """
         Returns the id for the current authentication, i.e. an identifier for the last
@@ -71,7 +77,7 @@
             self._authentication_id += 1
             # Saving the cookies after the first auth ensures we won't need to re-authenticate
             # on the next run, should this one be aborted or crash
-            await self._save_cookies()
+            self._save_cookies()
 
     async def _authenticate(self) -> None:
         """
@@ -80,26 +86,68 @@
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
-    async def _save_cookies(self) -> None:
+    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+        if not self._shared_auth:
+            return
+
+        if self._shared_auth in shared:
+            self._shared_cookie_jar_paths = shared[self._shared_auth]
+        else:
+            self._shared_cookie_jar_paths = []
+            shared[self._shared_auth] = self._shared_cookie_jar_paths
+
+        self._shared_cookie_jar_paths.append(self._cookie_jar_path)
+
+    def _load_cookies(self) -> None:
+        log.explain_topic("Loading cookies")
+        cookie_jar_path: Optional[Path] = None
+
+        if self._shared_cookie_jar_paths is None:
+            log.explain("Not sharing any cookies")
+            cookie_jar_path = self._cookie_jar_path
+        else:
+            log.explain("Sharing cookies")
+            max_mtime: Optional[float] = None
+            for path in self._shared_cookie_jar_paths:
+                if not path.is_file():
+                    log.explain(f"{fmt_real_path(path)} is not a file")
+                    continue
+                mtime = path.stat().st_mtime
+                if max_mtime is None or mtime > max_mtime:
+                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
+                    max_mtime = mtime
+                    cookie_jar_path = path
+                else:
+                    log.explain(f"{fmt_real_path(path)} has older mtime")
+
+        if cookie_jar_path is None:
+            log.explain("Couldn't find a suitable cookie file")
+            return
+
+        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
+        try:
+            self._current_cookie_jar = aiohttp.CookieJar()
+            self._current_cookie_jar.load(cookie_jar_path)
+        except Exception as e:
+            log.explain("Failed to load cookies")
+            log.explain(str(e))
+
+    def _save_cookies(self) -> None:
         log.explain_topic("Saving cookies")
         if not self._current_cookie_jar:
             log.explain("No cookie jar, save aborted")
             return
 
         try:
+            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
             self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
-        except Exception:
+        except Exception as e:
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
+            log.warn(str(e))
 
     async def run(self) -> None:
-        self._current_cookie_jar = aiohttp.CookieJar()
         self._request_count = 0
-
-        try:
-            self._current_cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
+        self._load_cookies()
 
         async with aiohttp.ClientSession(
             headers={"User-Agent": f"{NAME}/{VERSION}"},
@@ -114,4 +162,4 @@
         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
 
         # They are saved in authenticate, but a final save won't hurt
-        await self._save_cookies()
+        self._save_cookies()
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index 222e1d6..d488974 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -152,12 +152,15 @@ class KitIliasWebCrawler(HttpCrawler):
             config: Config,
             authenticators: Dict[str, Authenticator]
    ):
-        super().__init__(name, section, config)
+        # Setting a main authenticator for cookie sharing
+        auth = section.auth(authenticators)
+        super().__init__(name, section, config, shared_auth=auth)
 
         self._shibboleth_login = KitShibbolethLogin(
-            section.auth(authenticators),
-            section.tfa_auth(authenticators)
+            auth,
+            section.tfa_auth(authenticators),
         )
+
         self._base_url = "https://ilias.studium.kit.edu"
 
         self._target = section.target()
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index 2b9921e..35f5194 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -1,10 +1,11 @@
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from rich.markup import escape
 
 from .auth import AUTHENTICATORS, Authenticator
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError
+from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
 
@@ -42,6 +43,9 @@ class Pferd:
     def _load_crawlers(self) -> List[str]:
         names = []
 
+        # Cookie sharing
+        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
             names.append(name)
@@ -54,6 +58,10 @@
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+            if self._config.default_section.share_cookies():
+                if isinstance(crawler, KitIliasWebCrawler):
+                    crawler.share_cookies(kit_ilias_web_paths)
+
         return names
 
     def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]: