Refactor persistence of entity tags

This commit is contained in:
Scriptim 2024-10-27 12:21:51 +01:00
parent 8ff2e198e8
commit 993d934825
No known key found for this signature in database
GPG key ID: 1ABB18EA42CCAAF6
3 changed files with 39 additions and 27 deletions

View file

@ -294,7 +294,7 @@ class Crawler(ABC):
self, self,
path: PurePath, path: PurePath,
*, *,
etag: Optional[str] = None, etag_differs: Optional[bool] = None,
mtime: Optional[datetime] = None, mtime: Optional[datetime] = None,
redownload: Optional[Redownload] = None, redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None, on_conflict: Optional[OnConflict] = None,
@ -312,7 +312,7 @@ class Crawler(ABC):
fs_token = await self._output_dir.download( fs_token = await self._output_dir.download(
path, path,
transformed_path, transformed_path,
etag=etag, etag_differs=etag_differs,
mtime=mtime, mtime=mtime,
redownload=redownload, redownload=redownload,
on_conflict=on_conflict on_conflict=on_conflict

View file

@ -10,11 +10,13 @@ from bs4 import BeautifulSoup, Tag
from ..config import Config from ..config import Config
from ..logging import ProgressBar, log from ..logging import ProgressBar, log
from ..output_dir import ETAG_KEY_PATTERN, FileSink from ..output_dir import FileSink
from ..utils import soupify from ..utils import soupify
from .crawler import CrawlError from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection from .http_crawler import HttpCrawler, HttpCrawlerSection
ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
class KitIpdCrawlerSection(HttpCrawlerSection): class KitIpdCrawlerSection(HttpCrawlerSection):
def target(self) -> str: def target(self) -> str:
@ -92,15 +94,16 @@ class KitIpdCrawler(HttpCrawler):
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None: async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
element_path = parent / file.name element_path = parent / file.name
etag, mtime = await self._request_file_version(file) etag, mtime = await self._request_file_version(file)
maybe_dl = await self.download(element_path, etag=etag, mtime=mtime) prev_etag = self._get_previous_etag_from_report(element_path)
etag_differs = None if prev_etag is None else prev_etag != etag
maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime)
if not maybe_dl: if not maybe_dl:
# keep storing the known file's etag # keep storing the known file's etag
if self._output_dir.prev_report: if prev_etag:
etag_key = ETAG_KEY_PATTERN.format(element_path) self._add_etag_to_report(element_path, prev_etag)
prev_etag = self._output_dir.prev_report.get_custom_value(etag_key)
if prev_etag:
self._output_dir.report.add_custom_value(etag_key, prev_etag)
return return
async with maybe_dl as (bar, sink): async with maybe_dl as (bar, sink):
@ -167,7 +170,22 @@ class KitIpdCrawler(HttpCrawler):
sink.done() sink.done()
self._output_dir.report.add_custom_value(ETAG_KEY_PATTERN.format(path), resp.headers.get("ETag")) self._add_etag_to_report(path, resp.headers.get("ETag"))
def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
    """
    Return the entity tag stored for ``path`` in the previous report.

    The tags are read from the custom report value registered under
    ETAGS_CUSTOM_REPORT_VALUE_KEY and are looked up by ``str(path)``.
    The result is None when the output directory has no previous report
    or when the mapping has no entry for this path.
    """
    previous_report = self._output_dir.prev_report
    if previous_report:
        stored = previous_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY)
        # A missing (or otherwise falsy) custom value is treated as "no tags known".
        known_etags = stored if stored else {}
        return known_etags.get(str(path))
    return None
def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
    """
    Record ``etag`` for ``path`` in the current report.

    The tag is added to the mapping kept under
    ETAGS_CUSTOM_REPORT_VALUE_KEY, keyed by ``str(path)``, and the mapping
    is then written back to the report. Nothing is recorded when ``etag``
    is None or empty.
    """
    if etag:
        current_report = self._output_dir.report
        # Start from the tags collected so far, or an empty mapping if there are none yet.
        collected = current_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
        collected[str(path)] = etag
        current_report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, collected)
async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]: async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
""" """

View file

@ -9,7 +9,7 @@ from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from enum import Enum from enum import Enum
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import BinaryIO, Iterator, Optional, Tuple from typing import BinaryIO, ClassVar, Iterator, Optional, Tuple
from .logging import log from .logging import log
from .report import Report, ReportLoadError from .report import Report, ReportLoadError
@ -18,7 +18,6 @@ from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_
SUFFIX_CHARS = string.ascii_lowercase + string.digits SUFFIX_CHARS = string.ascii_lowercase + string.digits
SUFFIX_LENGTH = 6 SUFFIX_LENGTH = 6
TRIES = 5 TRIES = 5
ETAG_KEY_PATTERN = "etag-{}"
class OutputDirError(Exception): class OutputDirError(Exception):
@ -58,7 +57,7 @@ class OnConflict(Enum):
@dataclass @dataclass
class Heuristics: class Heuristics:
etag: Optional[str] etag_differs: Optional[bool]
mtime: Optional[datetime] mtime: Optional[datetime]
@ -195,7 +194,6 @@ class OutputDirectory:
def _should_download( def _should_download(
self, self,
path: PurePath,
local_path: Path, local_path: Path,
heuristics: Heuristics, heuristics: Heuristics,
redownload: Redownload, redownload: Redownload,
@ -237,16 +235,12 @@ class OutputDirectory:
remote_newer = None remote_newer = None
# ETag should be a more reliable indicator than mtime, so we check it first # ETag should be a more reliable indicator than mtime, so we check it first
if heuristics.etag: if heuristics.etag_differs is not None:
local_etag: Optional[str] = None remote_newer = heuristics.etag_differs
if self.prev_report: if remote_newer:
local_etag = self.prev_report.get_custom_value(ETAG_KEY_PATTERN.format(path)) log.explain("Remote file's entity tag differs")
if local_etag: else:
remote_newer = local_etag != heuristics.etag log.explain("Remote file's entity tag is the same")
if remote_newer:
log.explain("Remote file's entity tag differs")
else:
log.explain("Remote file's entity tag is the same")
# Python on Windows crashes when faced with timestamps around the unix epoch # Python on Windows crashes when faced with timestamps around the unix epoch
if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970): if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
@ -382,7 +376,7 @@ class OutputDirectory:
remote_path: PurePath, remote_path: PurePath,
path: PurePath, path: PurePath,
*, *,
etag: Optional[str] = None, etag_differs: Optional[bool] = None,
mtime: Optional[datetime] = None, mtime: Optional[datetime] = None,
redownload: Optional[Redownload] = None, redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None, on_conflict: Optional[OnConflict] = None,
@ -392,14 +386,14 @@ class OutputDirectory:
MarkConflictError. MarkConflictError.
""" """
heuristics = Heuristics(etag, mtime) heuristics = Heuristics(etag_differs, mtime)
redownload = self._redownload if redownload is None else redownload redownload = self._redownload if redownload is None else redownload
on_conflict = self._on_conflict if on_conflict is None else on_conflict on_conflict = self._on_conflict if on_conflict is None else on_conflict
local_path = self.resolve(path) local_path = self.resolve(path)
self._report.mark(path) self._report.mark(path)
if not self._should_download(path, local_path, heuristics, redownload, on_conflict): if not self._should_download(local_path, heuristics, redownload, on_conflict):
return None return None
# Detect and solve local-dir-remote-file conflict # Detect and solve local-dir-remote-file conflict