diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py index 904b38f..dd500e6 100644 --- a/PFERD/crawl/crawler.py +++ b/PFERD/crawl/crawler.py @@ -294,7 +294,7 @@ class Crawler(ABC): self, path: PurePath, *, - etag: Optional[str] = None, + etag_differs: Optional[bool] = None, mtime: Optional[datetime] = None, redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, @@ -312,7 +312,7 @@ class Crawler(ABC): fs_token = await self._output_dir.download( path, transformed_path, - etag=etag, + etag_differs=etag_differs, mtime=mtime, redownload=redownload, on_conflict=on_conflict diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 15f73a8..e7a15de 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -10,11 +10,13 @@ from bs4 import BeautifulSoup, Tag from ..config import Config from ..logging import ProgressBar, log -from ..output_dir import ETAG_KEY_PATTERN, FileSink +from ..output_dir import FileSink from ..utils import soupify from .crawler import CrawlError from .http_crawler import HttpCrawler, HttpCrawlerSection +ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags" + class KitIpdCrawlerSection(HttpCrawlerSection): def target(self) -> str: @@ -92,15 +94,16 @@ class KitIpdCrawler(HttpCrawler): async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None: element_path = parent / file.name + etag, mtime = await self._request_file_version(file) - maybe_dl = await self.download(element_path, etag=etag, mtime=mtime) + prev_etag = self._get_previous_etag_from_report(element_path) + etag_differs = None if prev_etag is None or etag is None else prev_etag != etag + + maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime) if not maybe_dl: # keep storing the known file's etag - if self._output_dir.prev_report: - etag_key = ETAG_KEY_PATTERN.format(element_path) - prev_etag = self._output_dir.prev_report.get_custom_value(etag_key) - if prev_etag: - 
self._output_dir.report.add_custom_value(etag_key, prev_etag) + if prev_etag: + self._add_etag_to_report(element_path, prev_etag) return async with maybe_dl as (bar, sink): @@ -167,7 +170,22 @@ class KitIpdCrawler(HttpCrawler): sink.done() - self._output_dir.report.add_custom_value(ETAG_KEY_PATTERN.format(path), resp.headers.get("ETag")) + self._add_etag_to_report(path, resp.headers.get("ETag")) + + def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]: + if not self._output_dir.prev_report: + return None + + etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} + return etags.get(str(path)) + + def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None: + if not etag: + return + + etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {} + etags[str(path)] = etag + self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags) async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]: """ diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py index 72a44a5..a0c5a1a 100644 --- a/PFERD/output_dir.py +++ b/PFERD/output_dir.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum from pathlib import Path, PurePath -from typing import BinaryIO, Iterator, Optional, Tuple +from typing import BinaryIO, ClassVar, Iterator, Optional, Tuple from .logging import log from .report import Report, ReportLoadError @@ -18,7 +18,6 @@ from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_ SUFFIX_CHARS = string.ascii_lowercase + string.digits SUFFIX_LENGTH = 6 TRIES = 5 -ETAG_KEY_PATTERN = "etag-{}" class OutputDirError(Exception): @@ -58,7 +57,7 @@ class OnConflict(Enum): @dataclass class Heuristics: - etag: Optional[str] + etag_differs: Optional[bool] mtime: Optional[datetime] @@ -195,7 +194,6 @@ class OutputDirectory: def _should_download( self, - path: 
PurePath, local_path: Path, heuristics: Heuristics, redownload: Redownload, @@ -237,16 +235,12 @@ class OutputDirectory: remote_newer = None # ETag should be a more reliable indicator than mtime, so we check it first - if heuristics.etag: - local_etag: Optional[str] = None - if self.prev_report: - local_etag = self.prev_report.get_custom_value(ETAG_KEY_PATTERN.format(path)) - if local_etag: - remote_newer = local_etag != heuristics.etag - if remote_newer: - log.explain("Remote file's entity tag differs") - else: - log.explain("Remote file's entity tag is the same") + if heuristics.etag_differs is not None: + remote_newer = heuristics.etag_differs + if remote_newer: + log.explain("Remote file's entity tag differs") + else: + log.explain("Remote file's entity tag is the same") # Python on Windows crashes when faced with timestamps around the unix epoch if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): @@ -382,7 +376,7 @@ class OutputDirectory: remote_path: PurePath, path: PurePath, *, - etag: Optional[str] = None, + etag_differs: Optional[bool] = None, mtime: Optional[datetime] = None, redownload: Optional[Redownload] = None, on_conflict: Optional[OnConflict] = None, @@ -392,14 +386,14 @@ class OutputDirectory: MarkConflictError. """ - heuristics = Heuristics(etag, mtime) + heuristics = Heuristics(etag_differs, mtime) redownload = self._redownload if redownload is None else redownload on_conflict = self._on_conflict if on_conflict is None else on_conflict local_path = self.resolve(path) self._report.mark(path) - if not self._should_download(path, local_path, heuristics, redownload, on_conflict): + if not self._should_download(local_path, heuristics, redownload, on_conflict): return None # Detect and solve local-dir-remote-file conflict