Refactor persistence of entity tags

This commit is contained in:
Scriptim 2024-10-27 12:21:51 +01:00
parent 8ff2e198e8
commit 993d934825
No known key found for this signature in database
GPG key ID: 1ABB18EA42CCAAF6
3 changed files with 39 additions and 27 deletions

View file

@ -294,7 +294,7 @@ class Crawler(ABC):
self,
path: PurePath,
*,
etag: Optional[str] = None,
etag_differs: Optional[bool] = None,
mtime: Optional[datetime] = None,
redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None,
@ -312,7 +312,7 @@ class Crawler(ABC):
fs_token = await self._output_dir.download(
path,
transformed_path,
etag=etag,
etag_differs=etag_differs,
mtime=mtime,
redownload=redownload,
on_conflict=on_conflict

View file

@ -10,11 +10,13 @@ from bs4 import BeautifulSoup, Tag
from ..config import Config
from ..logging import ProgressBar, log
from ..output_dir import ETAG_KEY_PATTERN, FileSink
from ..output_dir import FileSink
from ..utils import soupify
from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection
ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
class KitIpdCrawlerSection(HttpCrawlerSection):
def target(self) -> str:
@ -92,15 +94,16 @@ class KitIpdCrawler(HttpCrawler):
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
element_path = parent / file.name
etag, mtime = await self._request_file_version(file)
maybe_dl = await self.download(element_path, etag=etag, mtime=mtime)
prev_etag = self._get_previous_etag_from_report(element_path)
etag_differs = None if prev_etag is None else prev_etag != etag
maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime)
if not maybe_dl:
# keep storing the known file's etag
if self._output_dir.prev_report:
etag_key = ETAG_KEY_PATTERN.format(element_path)
prev_etag = self._output_dir.prev_report.get_custom_value(etag_key)
if prev_etag:
self._output_dir.report.add_custom_value(etag_key, prev_etag)
if prev_etag:
self._add_etag_to_report(element_path, prev_etag)
return
async with maybe_dl as (bar, sink):
@ -167,7 +170,22 @@ class KitIpdCrawler(HttpCrawler):
sink.done()
self._output_dir.report.add_custom_value(ETAG_KEY_PATTERN.format(path), resp.headers.get("ETag"))
self._add_etag_to_report(path, resp.headers.get("ETag"))
def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
    """
    Look up the entity tag recorded for ``path`` in the previous run's report.

    Returns ``None`` when the output directory has no previous report, when
    that report holds no entity-tag mapping, or when the mapping has no
    entry for ``str(path)``.
    """
    previous_report = self._output_dir.prev_report
    if previous_report:
        # All entity tags live in one custom report value: a mapping keyed
        # by the stringified path. A missing/empty value is treated as {}.
        known_etags = previous_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
        return known_etags.get(str(path))
    return None
def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
    """
    Record ``etag`` for ``path`` in the current run's report.

    The entity tags are kept in a single custom report value: a mapping from
    ``str(path)`` to the tag. A falsy ``etag`` (``None`` or an empty string)
    is ignored and leaves the report untouched.
    """
    if etag:
        current_report = self._output_dir.report
        # Start from the mapping stored so far, or a fresh one if none exists.
        known_etags = current_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
        known_etags[str(path)] = etag
        # Write the mapping back so a freshly created dict is persisted too.
        current_report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, known_etags)
async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
"""

View file

@ -9,7 +9,7 @@ from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from pathlib import Path, PurePath
from typing import BinaryIO, Iterator, Optional, Tuple
from typing import BinaryIO, ClassVar, Iterator, Optional, Tuple
from .logging import log
from .report import Report, ReportLoadError
@ -18,7 +18,6 @@ from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_
SUFFIX_CHARS = string.ascii_lowercase + string.digits
SUFFIX_LENGTH = 6
TRIES = 5
ETAG_KEY_PATTERN = "etag-{}"
class OutputDirError(Exception):
@ -58,7 +57,7 @@ class OnConflict(Enum):
@dataclass
class Heuristics:
etag: Optional[str]
etag_differs: Optional[bool]
mtime: Optional[datetime]
@ -195,7 +194,6 @@ class OutputDirectory:
def _should_download(
self,
path: PurePath,
local_path: Path,
heuristics: Heuristics,
redownload: Redownload,
@ -237,16 +235,12 @@ class OutputDirectory:
remote_newer = None
# ETag should be a more reliable indicator than mtime, so we check it first
if heuristics.etag:
local_etag: Optional[str] = None
if self.prev_report:
local_etag = self.prev_report.get_custom_value(ETAG_KEY_PATTERN.format(path))
if local_etag:
remote_newer = local_etag != heuristics.etag
if remote_newer:
log.explain("Remote file's entity tag differs")
else:
log.explain("Remote file's entity tag is the same")
if heuristics.etag_differs is not None:
remote_newer = heuristics.etag_differs
if remote_newer:
log.explain("Remote file's entity tag differs")
else:
log.explain("Remote file's entity tag is the same")
# Python on Windows crashes when faced with timestamps around the unix epoch
if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
@ -382,7 +376,7 @@ class OutputDirectory:
remote_path: PurePath,
path: PurePath,
*,
etag: Optional[str] = None,
etag_differs: Optional[bool] = None,
mtime: Optional[datetime] = None,
redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None,
@ -392,14 +386,14 @@ class OutputDirectory:
MarkConflictError.
"""
heuristics = Heuristics(etag, mtime)
heuristics = Heuristics(etag_differs, mtime)
redownload = self._redownload if redownload is None else redownload
on_conflict = self._on_conflict if on_conflict is None else on_conflict
local_path = self.resolve(path)
self._report.mark(path)
if not self._should_download(path, local_path, heuristics, redownload, on_conflict):
if not self._should_download(local_path, heuristics, redownload, on_conflict):
return None
# Detect and solve local-dir-remote-file conflict