mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 23:45:05 +02:00
Refactor persistence of entity tags
This commit is contained in:
parent
8ff2e198e8
commit
993d934825
3 changed files with 39 additions and 27 deletions
|
|
@ -294,7 +294,7 @@ class Crawler(ABC):
|
|||
self,
|
||||
path: PurePath,
|
||||
*,
|
||||
etag: Optional[str] = None,
|
||||
etag_differs: Optional[bool] = None,
|
||||
mtime: Optional[datetime] = None,
|
||||
redownload: Optional[Redownload] = None,
|
||||
on_conflict: Optional[OnConflict] = None,
|
||||
|
|
@ -312,7 +312,7 @@ class Crawler(ABC):
|
|||
fs_token = await self._output_dir.download(
|
||||
path,
|
||||
transformed_path,
|
||||
etag=etag,
|
||||
etag_differs=etag_differs,
|
||||
mtime=mtime,
|
||||
redownload=redownload,
|
||||
on_conflict=on_conflict
|
||||
|
|
|
|||
|
|
@ -10,11 +10,13 @@ from bs4 import BeautifulSoup, Tag
|
|||
|
||||
from ..config import Config
|
||||
from ..logging import ProgressBar, log
|
||||
from ..output_dir import ETAG_KEY_PATTERN, FileSink
|
||||
from ..output_dir import FileSink
|
||||
from ..utils import soupify
|
||||
from .crawler import CrawlError
|
||||
from .http_crawler import HttpCrawler, HttpCrawlerSection
|
||||
|
||||
ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
|
||||
|
||||
|
||||
class KitIpdCrawlerSection(HttpCrawlerSection):
|
||||
def target(self) -> str:
|
||||
|
|
@ -92,15 +94,16 @@ class KitIpdCrawler(HttpCrawler):
|
|||
|
||||
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
|
||||
element_path = parent / file.name
|
||||
|
||||
etag, mtime = await self._request_file_version(file)
|
||||
maybe_dl = await self.download(element_path, etag=etag, mtime=mtime)
|
||||
prev_etag = self._get_previous_etag_from_report(element_path)
|
||||
etag_differs = None if prev_etag is None else prev_etag != etag
|
||||
|
||||
maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime)
|
||||
if not maybe_dl:
|
||||
# keep storing the known file's etag
|
||||
if self._output_dir.prev_report:
|
||||
etag_key = ETAG_KEY_PATTERN.format(element_path)
|
||||
prev_etag = self._output_dir.prev_report.get_custom_value(etag_key)
|
||||
if prev_etag:
|
||||
self._output_dir.report.add_custom_value(etag_key, prev_etag)
|
||||
if prev_etag:
|
||||
self._add_etag_to_report(element_path, prev_etag)
|
||||
return
|
||||
|
||||
async with maybe_dl as (bar, sink):
|
||||
|
|
@ -167,7 +170,22 @@ class KitIpdCrawler(HttpCrawler):
|
|||
|
||||
sink.done()
|
||||
|
||||
self._output_dir.report.add_custom_value(ETAG_KEY_PATTERN.format(path), resp.headers.get("ETag"))
|
||||
self._add_etag_to_report(path, resp.headers.get("ETag"))
|
||||
|
||||
def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
    """
    Look up the entity tag stored for ``path`` in the previous run's report.

    The previous report keeps a single custom value (keyed by
    ETAGS_CUSTOM_REPORT_VALUE_KEY) that maps ``str(path)`` to an etag.

    Returns None if there is no previous report, if that report holds no
    etag mapping, or if the mapping has no entry for ``path``.
    """
    previous_report = self._output_dir.prev_report
    if previous_report:
        known_etags = previous_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY)
        # A missing or empty mapping behaves like a mapping without this path.
        return known_etags.get(str(path)) if known_etags else None

    return None
|
||||
|
||||
def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
    """
    Store ``etag`` for ``path`` in the current report's etag mapping.

    The mapping lives in one custom report value (keyed by
    ETAGS_CUSTOM_REPORT_VALUE_KEY) and is indexed by ``str(path)``. It is
    read from the current report, updated and written back.

    Does nothing when ``etag`` is None or empty.
    """
    if etag:
        report = self._output_dir.report

        tag_by_path = report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY)
        if not tag_by_path:
            # Nothing recorded yet during this run, start a fresh mapping.
            tag_by_path = {}

        tag_by_path[str(path)] = etag
        report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, tag_by_path)
|
||||
|
||||
async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from dataclasses import dataclass
|
|||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from pathlib import Path, PurePath
|
||||
from typing import BinaryIO, Iterator, Optional, Tuple
|
||||
from typing import BinaryIO, ClassVar, Iterator, Optional, Tuple
|
||||
|
||||
from .logging import log
|
||||
from .report import Report, ReportLoadError
|
||||
|
|
@ -18,7 +18,6 @@ from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_
|
|||
SUFFIX_CHARS = string.ascii_lowercase + string.digits
|
||||
SUFFIX_LENGTH = 6
|
||||
TRIES = 5
|
||||
ETAG_KEY_PATTERN = "etag-{}"
|
||||
|
||||
|
||||
class OutputDirError(Exception):
|
||||
|
|
@ -58,7 +57,7 @@ class OnConflict(Enum):
|
|||
|
||||
@dataclass
|
||||
class Heuristics:
|
||||
etag: Optional[str]
|
||||
etag_differs: Optional[bool]
|
||||
mtime: Optional[datetime]
|
||||
|
||||
|
||||
|
|
@ -195,7 +194,6 @@ class OutputDirectory:
|
|||
|
||||
def _should_download(
|
||||
self,
|
||||
path: PurePath,
|
||||
local_path: Path,
|
||||
heuristics: Heuristics,
|
||||
redownload: Redownload,
|
||||
|
|
@ -237,16 +235,12 @@ class OutputDirectory:
|
|||
remote_newer = None
|
||||
|
||||
# ETag should be a more reliable indicator than mtime, so we check it first
|
||||
if heuristics.etag:
|
||||
local_etag: Optional[str] = None
|
||||
if self.prev_report:
|
||||
local_etag = self.prev_report.get_custom_value(ETAG_KEY_PATTERN.format(path))
|
||||
if local_etag:
|
||||
remote_newer = local_etag != heuristics.etag
|
||||
if remote_newer:
|
||||
log.explain("Remote file's entity tag differs")
|
||||
else:
|
||||
log.explain("Remote file's entity tag is the same")
|
||||
if heuristics.etag_differs is not None:
|
||||
remote_newer = heuristics.etag_differs
|
||||
if remote_newer:
|
||||
log.explain("Remote file's entity tag differs")
|
||||
else:
|
||||
log.explain("Remote file's entity tag is the same")
|
||||
|
||||
# Python on Windows crashes when faced with timestamps around the unix epoch
|
||||
if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
|
||||
|
|
@ -382,7 +376,7 @@ class OutputDirectory:
|
|||
remote_path: PurePath,
|
||||
path: PurePath,
|
||||
*,
|
||||
etag: Optional[str] = None,
|
||||
etag_differs: Optional[bool] = None,
|
||||
mtime: Optional[datetime] = None,
|
||||
redownload: Optional[Redownload] = None,
|
||||
on_conflict: Optional[OnConflict] = None,
|
||||
|
|
@ -392,14 +386,14 @@ class OutputDirectory:
|
|||
MarkConflictError.
|
||||
"""
|
||||
|
||||
heuristics = Heuristics(etag, mtime)
|
||||
heuristics = Heuristics(etag_differs, mtime)
|
||||
redownload = self._redownload if redownload is None else redownload
|
||||
on_conflict = self._on_conflict if on_conflict is None else on_conflict
|
||||
local_path = self.resolve(path)
|
||||
|
||||
self._report.mark(path)
|
||||
|
||||
if not self._should_download(path, local_path, heuristics, redownload, on_conflict):
|
||||
if not self._should_download(local_path, heuristics, redownload, on_conflict):
|
||||
return None
|
||||
|
||||
# Detect and solve local-dir-remote-file conflict
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue