Examine ETag header to determine version of KIT-IPD files

This commit is contained in:
Scriptim 2024-10-25 22:46:04 +02:00
parent c562c61caf
commit 2a59f76170
No known key found for this signature in database
GPG key ID: 1ABB18EA42CCAAF6
3 changed files with 54 additions and 18 deletions

View file

@ -293,6 +293,8 @@ class Crawler(ABC):
async def download(
self,
path: PurePath,
*,
etag: Optional[str] = None,
mtime: Optional[datetime] = None,
redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None,
@ -307,7 +309,14 @@ class Crawler(ABC):
log.status("[bold bright_black]", "Ignored", fmt_path(path))
return None
fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict)
fs_token = await self._output_dir.download(
path,
transformed_path,
etag=etag,
mtime=mtime,
redownload=redownload,
on_conflict=on_conflict
)
if fs_token is None:
log.explain("Answer: No")
return None

View file

@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag
from ..config import Config
from ..logging import ProgressBar, log
from ..output_dir import FileSink
from ..output_dir import ETAG_KEY_PATTERN, FileSink
from ..utils import soupify
from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection
@ -92,13 +92,19 @@ class KitIpdCrawler(HttpCrawler):
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
    """
    Download a single file below the given parent path.

    The remote ETag and modification time are requested first and passed on to
    self.download. If no download is started, the etag recorded for this path in
    the previous report (if there is one) is copied into the current report.
    """
    target_path = parent / file.name

    etag, mtime = await self._request_file_version(file)
    download = await self.download(target_path, etag=etag, mtime=mtime)

    if download:
        async with download as (bar, sink):
            await self._stream_from_url(file.url, target_path, sink, bar)
        return

    # Nothing is downloaded: keep storing the known file's etag
    previous_report = self._output_dir.prev_report
    if not previous_report:
        return

    report_key = ETAG_KEY_PATTERN.format(target_path)
    known_etag = previous_report.get_custom_value(report_key)
    if known_etag:
        self._output_dir.report.add_custom_value(report_key, known_etag)
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
page, url = await self.get_page()
@ -148,7 +154,7 @@ class KitIpdCrawler(HttpCrawler):
def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
    """
    Join the href attribute of the given link tag with the given URL.
    """
    href = link_tag.get("href")
    return urljoin(url, href)
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
async with self.session.get(url, allow_redirects=False) as resp:
if resp.status == 403:
raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
@ -161,25 +167,29 @@ class KitIpdCrawler(HttpCrawler):
sink.done()
async def _request_last_modified(self, file: KitIpdFile) -> Optional[datetime]:
self._output_dir.report.add_custom_value(ETAG_KEY_PATTERN.format(path), resp.headers.get("ETag"))
async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
    """
    Request the ETag and Last-Modified headers of a file via a HEAD request.

    Returns a tuple (etag, last_modified). If no etag / modification date can be
    obtained, the corresponding value will be None:

    - both values are None if the response status is not 200,
    - etag is None if the response has no ETag header,
    - last_modified is None if the Last-Modified header is missing or can't be parsed.
    """
    async with self.session.head(file.url) as resp:
        if resp.status != 200:
            return None, None

        etag = resp.headers.get("ETag")
        last_modified_header = resp.headers.get("Last-Modified")

        # Keep the raw header and the parsed value apart, so an unparseable header
        # can never leak out as a string in place of a datetime.
        last_modified: Optional[datetime] = None

        # Only try to parse if the header is present: strptime raises a TypeError
        # (not a ValueError) when handed None.
        if last_modified_header:
            try:
                # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
                datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
                # NOTE(review): "GMT" is matched as literal text, so the result is a
                # naive datetime without tzinfo - confirm callers expect that.
                last_modified = datetime.strptime(last_modified_header, datetime_format)
            except ValueError:
                # last_modified remains None
                pass

        return etag, last_modified
async def get_page(self) -> Tuple[BeautifulSoup, str]:
async with self.session.get(self._url) as request:

View file

@ -18,6 +18,7 @@ from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_
SUFFIX_CHARS = string.ascii_lowercase + string.digits
SUFFIX_LENGTH = 6
TRIES = 5
# Format pattern for the custom report key under which a file's ETag is stored.
# The placeholder is filled with the file's path.
ETAG_KEY_PATTERN = "etag-{}"
class OutputDirError(Exception):
@ -57,6 +58,7 @@ class OnConflict(Enum):
# Version indicators of a remote file, used when deciding whether it should be
# downloaded. Either value may be None if it is not known.
@dataclass
class Heuristics:
# Entity tag of the remote file; compared against the etag stored in the previous
# report and checked before mtime.
etag: Optional[str]
# Modification time of the remote file; compared against the local file's mtime.
mtime: Optional[datetime]
@ -193,6 +195,7 @@ class OutputDirectory:
def _should_download(
self,
path: PurePath,
local_path: Path,
heuristics: Heuristics,
redownload: Redownload,
@ -233,8 +236,20 @@ class OutputDirectory:
remote_newer = None
# ETag should be a more reliable indicator than mtime, so we check it first
if heuristics.etag:
local_etag: Optional[str] = None
if self.prev_report:
local_etag = self.prev_report.get_custom_value(ETAG_KEY_PATTERN.format(path))
if local_etag:
remote_newer = local_etag != heuristics.etag
if remote_newer:
log.explain("Remote file's entity tag differs")
else:
log.explain("Remote file's entity tag is the same")
# Python on Windows crashes when faced with timestamps around the unix epoch
if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
mtime = heuristics.mtime
remote_newer = mtime.timestamp() > stat.st_mtime
if remote_newer:
@ -366,6 +381,8 @@ class OutputDirectory:
self,
remote_path: PurePath,
path: PurePath,
*,
etag: Optional[str] = None,
mtime: Optional[datetime] = None,
redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None,
@ -375,14 +392,14 @@ class OutputDirectory:
MarkConflictError.
"""
heuristics = Heuristics(mtime)
heuristics = Heuristics(etag, mtime)
redownload = self._redownload if redownload is None else redownload
on_conflict = self._on_conflict if on_conflict is None else on_conflict
local_path = self.resolve(path)
self._report.mark(path)
if not self._should_download(local_path, heuristics, redownload, on_conflict):
if not self._should_download(path, local_path, heuristics, redownload, on_conflict):
return None
# Detect and solve local-dir-remote-file conflict