From 2a59f7617055af1758f728619b0386a9838f8df1 Mon Sep 17 00:00:00 2001
From: Scriptim
Date: Fri, 25 Oct 2024 22:46:04 +0200
Subject: [PATCH] Examine ETag header to determine version of KIT-IPD files

---
 PFERD/crawl/crawler.py         | 11 ++++++++++-
 PFERD/crawl/kit_ipd_crawler.py | 35 ++++++++++++++++++++++------------
 PFERD/output_dir.py            | 23 +++++++++++++++++++---
 3 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index 0e67c02..904b38f 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -293,6 +293,8 @@ class Crawler(ABC):
     async def download(
             self,
             path: PurePath,
+            *,
+            etag: Optional[str] = None,
             mtime: Optional[datetime] = None,
             redownload: Optional[Redownload] = None,
             on_conflict: Optional[OnConflict] = None,
@@ -307,7 +309,14 @@ class Crawler(ABC):
             log.status("[bold bright_black]", "Ignored", fmt_path(path))
             return None
 
-        fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict)
+        fs_token = await self._output_dir.download(
+            path,
+            transformed_path,
+            etag=etag,
+            mtime=mtime,
+            redownload=redownload,
+            on_conflict=on_conflict
+        )
         if fs_token is None:
             log.explain("Answer: No")
             return None
diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index 4bf305b..e46158e 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag
 
 from ..config import Config
 from ..logging import ProgressBar, log
-from ..output_dir import FileSink
+from ..output_dir import ETAG_KEY_PATTERN, FileSink
 from ..utils import soupify
 from .crawler import CrawlError
 from .http_crawler import HttpCrawler, HttpCrawlerSection
@@ -92,13 +92,19 @@ class KitIpdCrawler(HttpCrawler):
 
     async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
         element_path = parent / file.name
-        mtime = await self._request_last_modified(file)
-        maybe_dl = await self.download(element_path, mtime=mtime)
+        etag, mtime = await self._request_file_version(file)
+        maybe_dl = await self.download(element_path, etag=etag, mtime=mtime)
         if not maybe_dl:
+            # keep storing the known file's etag
+            if self._output_dir.prev_report:
+                etag_key = ETAG_KEY_PATTERN.format(element_path)
+                prev_etag = self._output_dir.prev_report.get_custom_value(etag_key)
+                if prev_etag:
+                    self._output_dir.report.add_custom_value(etag_key, prev_etag)
             return
 
         async with maybe_dl as (bar, sink):
-            await self._stream_from_url(file.url, sink, bar)
+            await self._stream_from_url(file.url, element_path, sink, bar)
 
     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
         page, url = await self.get_page()
@@ -148,7 +154,7 @@ class KitIpdCrawler(HttpCrawler):
     def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
         return urljoin(url, link_tag.get("href"))
 
-    async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
+    async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
         async with self.session.get(url, allow_redirects=False) as resp:
             if resp.status == 403:
                 raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
@@ -161,25 +167,30 @@ class KitIpdCrawler(HttpCrawler):
 
             sink.done()
 
-    async def _request_last_modified(self, file: KitIpdFile) -> Optional[datetime]:
+            self._output_dir.report.add_custom_value(ETAG_KEY_PATTERN.format(path), resp.headers.get("ETag"))
+
+    async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
         """
-        Request the Last-Modified header of a file via a HEAD request.
-        If no modification date can be obtained, return None.
+        Request the ETag and Last-Modified headers of a file via a HEAD request.
+        If no etag / modification date can be obtained, the corresponding value will be None.
         """
         async with self.session.head(file.url) as resp:
             if resp.status != 200:
-                return None
+                return None, None
 
+            etag = resp.headers.get("ETag")
             last_modified_header = resp.headers.get("Last-Modified")
             if not last_modified_header:
-                return None
+                # No modification date was sent; the ETag alone may still identify the version
+                return etag, None
 
             try:
                 # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
                 datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
-                return datetime.strptime(last_modified_header, datetime_format)
+                return etag, datetime.strptime(last_modified_header, datetime_format)
             except ValueError:
-                return None
+                # Unparseable date: report no mtime rather than the raw header string
+                return etag, None
 
     async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index e9e9b93..72a44a5 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -18,6 +18,7 @@ from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_
 SUFFIX_CHARS = string.ascii_lowercase + string.digits
 SUFFIX_LENGTH = 6
 TRIES = 5
+ETAG_KEY_PATTERN = "etag-{}"
 
 
 class OutputDirError(Exception):
@@ -57,6 +58,7 @@ class OnConflict(Enum):
 
 @dataclass
 class Heuristics:
+    etag: Optional[str]
     mtime: Optional[datetime]
 
 
@@ -193,6 +195,7 @@ class OutputDirectory:
 
     def _should_download(
             self,
+            path: PurePath,
             local_path: Path,
             heuristics: Heuristics,
             redownload: Redownload,
@@ -233,8 +236,20 @@ class OutputDirectory:
 
         remote_newer = None
 
+        # ETag should be a more reliable indicator than mtime, so we check it first
+        if heuristics.etag:
+            local_etag: Optional[str] = None
+            if self.prev_report:
+                local_etag = self.prev_report.get_custom_value(ETAG_KEY_PATTERN.format(path))
+            if local_etag:
+                remote_newer = local_etag != heuristics.etag
+                if remote_newer:
+                    log.explain("Remote file's entity tag differs")
+                else:
+                    log.explain("Remote file's entity tag is the same")
+
         # Python on Windows crashes when faced with timestamps around the unix epoch
-        if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
+        if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
             mtime = heuristics.mtime
             remote_newer = mtime.timestamp() > stat.st_mtime
             if remote_newer:
@@ -366,6 +381,8 @@ class OutputDirectory:
             self,
             remote_path: PurePath,
             path: PurePath,
+            *,
+            etag: Optional[str] = None,
             mtime: Optional[datetime] = None,
             redownload: Optional[Redownload] = None,
             on_conflict: Optional[OnConflict] = None,
@@ -375,14 +392,14 @@ class OutputDirectory:
         MarkConflictError.
         """
 
-        heuristics = Heuristics(mtime)
+        heuristics = Heuristics(etag, mtime)
         redownload = self._redownload if redownload is None else redownload
         on_conflict = self._on_conflict if on_conflict is None else on_conflict
         local_path = self.resolve(path)
 
         self._report.mark(path)
 
-        if not self._should_download(local_path, heuristics, redownload, on_conflict):
+        if not self._should_download(path, local_path, heuristics, redownload, on_conflict):
             return None
 
         # Detect and solve local-dir-remote-file conflict