From c687d4a51a27bb2121293282a4640c91c5a4ac14 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Mon, 24 May 2021 13:10:19 +0200
Subject: [PATCH] Implement cookie sharing

Crawlers that use the same authenticator can now share their cookies.
Each crawler still saves cookies to its own jar, but on startup it
loads the most recently modified jar among all crawlers sharing its
authenticator, so a session established by one crawler is reused by
the others. The behaviour is controlled by the new `share_cookies`
option in the default section (with a matching --share-cookies CLI
flag) and is enabled by default.

While at it, cookie loading and saving were turned into synchronous
helpers with better log output, and --explain now writes "yes"/"no"
instead of "true"/"false", matching the values documented in CONFIG.md.

---
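Notes:

Example of a setup where cookie sharing kicks in: two crawlers that
crawl the same ILIAS instance with the same account. A minimal sketch
of a pferd.cfg; apart from `share_cookies`, the section and option
values (auth type, targets) are illustrative only and not taken from
this patch:

    [DEFAULT]
    share_cookies = yes

    [auth:ilias]
    type = simple

    [crawl:lecture-a]
    type = kit-ilias-web
    auth = auth:ilias
    target = 1234567

    [crawl:lecture-b]
    type = kit-ilias-web
    auth = auth:ilias
    target = 7654321

Both crawlers resolve the same authenticator section, so their cookie
jar paths end up in one shared list.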
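The interesting part of http_crawler.py is how a crawler picks the jar
to load: _load_cookies() walks all jar paths registered for its
authenticator and takes the one with the newest mtime, i.e. the one
most recently saved after a successful authentication. A standalone
sketch of that selection, assuming plain pathlib paths
(`newest_cookie_jar` is a made-up helper name, not part of the patch):

    from pathlib import Path
    from typing import List, Optional

    def newest_cookie_jar(paths: List[Path]) -> Optional[Path]:
        # Mirrors the mtime comparison in HttpCrawler._load_cookies():
        # skip jars that don't exist yet, keep the newest of the rest.
        newest: Optional[Path] = None
        max_mtime: Optional[float] = None
        for path in paths:
            if not path.is_file():
                continue  # this crawler hasn't saved cookies yet
            mtime = path.stat().st_mtime
            if max_mtime is None or mtime > max_mtime:
                max_mtime = mtime
                newest = path
        return newest

If no shared jar exists yet (first run), no cookies are loaded and the
crawler starts with a fresh session.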
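Wiring in pferd.py: Pferd keeps one Dict[Authenticator, List[Path]] and
lets every KitIliasWebCrawler register its jar path under its main
authenticator, so only crawlers that log in with the same account share
anything. A rough equivalent of HttpCrawler.share_cookies(), with plain
strings standing in for Authenticator instances and made-up jar paths
(the real jar file name comes from HttpCrawler.COOKIE_FILE, which this
patch doesn't show):

    from pathlib import Path
    from typing import Dict, List

    def share_cookies(auth: str, jar: Path, shared: Dict[str, List[Path]]) -> None:
        # The first crawler for an authenticator creates the shared
        # list; later ones append to the same list object.
        shared.setdefault(auth, []).append(jar)

    shared: Dict[str, List[Path]] = {}
    share_cookies("auth:ilias", Path("Lecture-A/.cookies"), shared)
    share_cookies("auth:ilias", Path("Lecture-B/.cookies"), shared)
    assert shared["auth:ilias"] == [Path("Lecture-A/.cookies"),
                                    Path("Lecture-B/.cookies")]

Because each crawler holds a reference to the shared list itself, jar
paths registered by later crawlers are also visible to crawlers loaded
earlier.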
 CONFIG.md                                  |  5 ++
 PFERD/cli/parser.py                        |  9 ++-
 PFERD/config.py                            |  3 +
 PFERD/crawl/http_crawler.py                | 80 +++++++++++++++++-----
 PFERD/crawl/ilias/kit_ilias_web_crawler.py |  9 ++-
 PFERD/pferd.py                             | 10 ++-
 6 files changed, 95 insertions(+), 21 deletions(-)

diff --git a/CONFIG.md b/CONFIG.md
index dcc7421..7e8a717 100644
--- a/CONFIG.md
+++ b/CONFIG.md
@@ -25,6 +25,11 @@ default values for the other sections.
   `yes`)
 - `report`: Whether PFERD should print a report of added, changed and deleted
   local files for all crawlers before exiting. (Default: `yes`)
+- `share_cookies`: Whether crawlers should share cookies where applicable. By
+  default, crawlers are isolated and don't interact with each other. This
+  includes their cookies. However, in situations where multiple crawlers crawl
+  the same website using the same account, sharing cookies between crawlers can
+  make sense. (Default: `yes`)
 
 ## The `crawl:*` sections
 
diff --git a/PFERD/cli/parser.py b/PFERD/cli/parser.py
index 346070f..72abb76 100644
--- a/PFERD/cli/parser.py
+++ b/PFERD/cli/parser.py
@@ -169,6 +169,11 @@ PARSER.add_argument(
     action=BooleanOptionalAction,
     help="print a report of all local changes before exiting"
 )
+PARSER.add_argument(
+    "--share-cookies",
+    action=BooleanOptionalAction,
+    help="whether crawlers should share cookies where applicable"
+)
 
 
 def load_default_section(
@@ -180,7 +185,9 @@ def load_default_section(
     if args.working_dir is not None:
         section["working_dir"] = str(args.working_dir)
     if args.explain is not None:
-        section["explain"] = "true" if args.explain else "false"
+        section["explain"] = "yes" if args.explain else "no"
+    if args.share_cookies is not None:
+        section["share_cookies"] = "yes" if args.share_cookies else "no"
 
 
 SUBPARSERS = PARSER.add_subparsers(title="crawlers")
diff --git a/PFERD/config.py b/PFERD/config.py
index 0c99683..abd6e9e 100644
--- a/PFERD/config.py
+++ b/PFERD/config.py
@@ -81,6 +81,9 @@ class DefaultSection(Section):
     def report(self) -> bool:
         return self.s.getboolean("report", fallback=True)
 
+    def share_cookies(self) -> bool:
+        return self.s.getboolean("share_cookies", fallback=True)
+
 
 class Config:
     @staticmethod
diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py
index 8cd6afe..facc2ba 100644
--- a/PFERD/crawl/http_crawler.py
+++ b/PFERD/crawl/http_crawler.py
@@ -1,10 +1,11 @@
 import asyncio
-from pathlib import PurePath
-from typing import Optional
+from pathlib import Path, PurePath
+from typing import Dict, List, Optional
 
 import aiohttp
 from aiohttp.client import ClientTimeout
 
+from ..auth import Authenticator
 from ..config import Config
 from ..logging import log
 from ..utils import fmt_real_path
@@ -25,17 +26,22 @@ class HttpCrawler(Crawler):
         name: str,
         section: HttpCrawlerSection,
         config: Config,
+        shared_auth: Optional[Authenticator] = None,
     ) -> None:
         super().__init__(name, section, config)
 
-        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
-        self._output_dir.register_reserved(self.COOKIE_FILE)
         self._authentication_id = 0
         self._authentication_lock = asyncio.Lock()
-        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
         self._request_count = 0
         self._http_timeout = section.http_timeout()
 
+        self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
+        self._shared_cookie_jar_paths: Optional[List[Path]] = None
+        self._shared_auth = shared_auth
+        self._current_cookie_jar: Optional[aiohttp.CookieJar] = None
+
+        self._output_dir.register_reserved(self.COOKIE_FILE)
+
     async def _current_auth_id(self) -> int:
         """
         Returns the id for the current authentication, i.e. an identifier for the last
@@ -71,7 +77,7 @@
             self._authentication_id += 1
             # Saving the cookies after the first auth ensures we won't need to re-authenticate
             # on the next run, should this one be aborted or crash
-            await self._save_cookies()
+            self._save_cookies()
 
     async def _authenticate(self) -> None:
         """
@@ -80,26 +86,68 @@
         """
         raise RuntimeError("_authenticate() was called but crawler doesn't provide an implementation")
 
-    async def _save_cookies(self) -> None:
+    def share_cookies(self, shared: Dict[Authenticator, List[Path]]) -> None:
+        if not self._shared_auth:
+            return
+
+        if self._shared_auth in shared:
+            self._shared_cookie_jar_paths = shared[self._shared_auth]
+        else:
+            self._shared_cookie_jar_paths = []
+            shared[self._shared_auth] = self._shared_cookie_jar_paths
+
+        self._shared_cookie_jar_paths.append(self._cookie_jar_path)
+
+    def _load_cookies(self) -> None:
+        log.explain_topic("Loading cookies")
+        cookie_jar_path: Optional[Path] = None
+
+        if self._shared_cookie_jar_paths is None:
+            log.explain("Not sharing any cookies")
+            cookie_jar_path = self._cookie_jar_path
+        else:
+            log.explain("Sharing cookies")
+            max_mtime: Optional[float] = None
+            for path in self._shared_cookie_jar_paths:
+                if not path.is_file():
+                    log.explain(f"{fmt_real_path(path)} is not a file")
+                    continue
+                mtime = path.stat().st_mtime
+                if max_mtime is None or mtime > max_mtime:
+                    log.explain(f"{fmt_real_path(path)} has newest mtime so far")
+                    max_mtime = mtime
+                    cookie_jar_path = path
+                else:
+                    log.explain(f"{fmt_real_path(path)} has older mtime")
+
+        if cookie_jar_path is None:
+            log.explain("Couldn't find a suitable cookie file")
+            return
+
+        log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
+        try:
+            self._current_cookie_jar = aiohttp.CookieJar()
+            self._current_cookie_jar.load(cookie_jar_path)
+        except Exception as e:
+            log.explain("Failed to load cookies")
+            log.explain(str(e))
+
+    def _save_cookies(self) -> None:
         log.explain_topic("Saving cookies")
         if not self._current_cookie_jar:
             log.explain("No cookie jar, save aborted")
             return
 
         try:
+            log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
             self._current_cookie_jar.save(self._cookie_jar_path)
-            log.explain(f"Cookies saved to {fmt_real_path(self._cookie_jar_path)}")
-        except Exception:
+        except Exception as e:
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
+            log.warn(str(e))
 
     async def run(self) -> None:
-        self._current_cookie_jar = aiohttp.CookieJar()
         self._request_count = 0
-
-        try:
-            self._current_cookie_jar.load(self._cookie_jar_path)
-        except Exception:
-            pass
+        self._load_cookies()
 
         async with aiohttp.ClientSession(
             headers={"User-Agent": f"{NAME}/{VERSION}"},
@@ -114,4 +162,4 @@
         log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
 
         # They are saved in authenticate, but a final save won't hurt
-        await self._save_cookies()
+        self._save_cookies()
diff --git a/PFERD/crawl/ilias/kit_ilias_web_crawler.py b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
index 222e1d6..d488974 100644
--- a/PFERD/crawl/ilias/kit_ilias_web_crawler.py
+++ b/PFERD/crawl/ilias/kit_ilias_web_crawler.py
@@ -152,12 +152,15 @@ class KitIliasWebCrawler(HttpCrawler):
             config: Config,
             authenticators: Dict[str, Authenticator]
    ):
-        super().__init__(name, section, config)
+        # Setting a main authenticator for cookie sharing
+        auth = section.auth(authenticators)
+        super().__init__(name, section, config, shared_auth=auth)
 
         self._shibboleth_login = KitShibbolethLogin(
-            section.auth(authenticators),
-            section.tfa_auth(authenticators)
+            auth,
+            section.tfa_auth(authenticators),
         )
+
         self._base_url = "https://ilias.studium.kit.edu"
 
         self._target = section.target()
diff --git a/PFERD/pferd.py b/PFERD/pferd.py
index 2b9921e..35f5194 100644
--- a/PFERD/pferd.py
+++ b/PFERD/pferd.py
@@ -1,10 +1,11 @@
+from pathlib import Path
 from typing import Dict, List, Optional
 
 from rich.markup import escape
 
 from .auth import AUTHENTICATORS, Authenticator
 from .config import Config, ConfigOptionError
-from .crawl import CRAWLERS, Crawler, CrawlError
+from .crawl import CRAWLERS, Crawler, CrawlError, KitIliasWebCrawler
 from .logging import log
 from .utils import fmt_path
 
@@ -42,6 +43,9 @@ class Pferd:
     def _load_crawlers(self) -> List[str]:
         names = []
 
+        # Cookie sharing
+        kit_ilias_web_paths: Dict[Authenticator, List[Path]] = {}
+
         for name, section in self._config.crawler_sections():
             log.print(f"[bold bright_cyan]Loading[/] {escape(name)}")
             names.append(name)
@@ -54,6 +58,10 @@
             crawler = crawler_constructor(name, section, self._config, self._authenticators)
             self._crawlers[name] = crawler
 
+            if self._config.default_section.share_cookies():
+                if isinstance(crawler, KitIliasWebCrawler):
+                    crawler.share_cookies(kit_ilias_web_paths)
+
         return names
 
     def _find_crawlers_to_run(self, loaded_crawlers: List[str]) -> List[str]: