mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 15:35:05 +02:00
Examine ETag header to determine version of KIT-IPD files
This commit is contained in:
parent
c562c61caf
commit
2a59f76170
3 changed files with 54 additions and 18 deletions
|
|
@ -293,6 +293,8 @@ class Crawler(ABC):
|
|||
async def download(
|
||||
self,
|
||||
path: PurePath,
|
||||
*,
|
||||
etag: Optional[str] = None,
|
||||
mtime: Optional[datetime] = None,
|
||||
redownload: Optional[Redownload] = None,
|
||||
on_conflict: Optional[OnConflict] = None,
|
||||
|
|
@ -307,7 +309,14 @@ class Crawler(ABC):
|
|||
log.status("[bold bright_black]", "Ignored", fmt_path(path))
|
||||
return None
|
||||
|
||||
fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict)
|
||||
fs_token = await self._output_dir.download(
|
||||
path,
|
||||
transformed_path,
|
||||
etag=etag,
|
||||
mtime=mtime,
|
||||
redownload=redownload,
|
||||
on_conflict=on_conflict
|
||||
)
|
||||
if fs_token is None:
|
||||
log.explain("Answer: No")
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, Tag
|
|||
|
||||
from ..config import Config
|
||||
from ..logging import ProgressBar, log
|
||||
from ..output_dir import FileSink
|
||||
from ..output_dir import ETAG_KEY_PATTERN, FileSink
|
||||
from ..utils import soupify
|
||||
from .crawler import CrawlError
|
||||
from .http_crawler import HttpCrawler, HttpCrawlerSection
|
||||
|
|
@ -92,13 +92,19 @@ class KitIpdCrawler(HttpCrawler):
|
|||
|
||||
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
    """
    Download a single file into the directory given by parent.

    The remote ETag and Last-Modified values are requested first and handed
    to self.download, which decides whether a download token is issued.
    If no token is issued, the etag stored in the previous report (if any)
    is copied into the current report so that it is not lost.
    """
    target = parent / file.name

    remote_etag, remote_mtime = await self._request_file_version(file)
    token = await self.download(target, etag=remote_etag, mtime=remote_mtime)

    if token:
        async with token as (bar, sink):
            await self._stream_from_url(file.url, target, sink, bar)
        return

    # No download happened: keep storing the known file's etag
    previous_report = self._output_dir.prev_report
    if not previous_report:
        return

    key = ETAG_KEY_PATTERN.format(target)
    known_etag = previous_report.get_custom_value(key)
    if known_etag:
        self._output_dir.report.add_custom_value(key, known_etag)
|
||||
|
||||
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
|
||||
page, url = await self.get_page()
|
||||
|
|
@ -148,7 +154,7 @@ class KitIpdCrawler(HttpCrawler):
|
|||
def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
|
||||
return urljoin(url, link_tag.get("href"))
|
||||
|
||||
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
|
||||
async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None:
|
||||
async with self.session.get(url, allow_redirects=False) as resp:
|
||||
if resp.status == 403:
|
||||
raise CrawlError("Received a 403. Are you within the KIT network/VPN?")
|
||||
|
|
@ -161,25 +167,29 @@ class KitIpdCrawler(HttpCrawler):
|
|||
|
||||
sink.done()
|
||||
|
||||
async def _request_last_modified(self, file: KitIpdFile) -> Optional[datetime]:
|
||||
self._output_dir.report.add_custom_value(ETAG_KEY_PATTERN.format(path), resp.headers.get("ETag"))
|
||||
|
||||
async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
    """
    Request the ETag and Last-Modified headers of a file via a HEAD request.

    Returns a tuple (etag, last_modified). If the response status is not 200,
    both values are None. Otherwise each value is None if its header is
    missing; last_modified is also None if the header cannot be parsed.
    """
    async with self.session.head(file.url) as resp:
        if resp.status != 200:
            return None, None

        etag = resp.headers.get("ETag")
        last_modified_header = resp.headers.get("Last-Modified")

        # The parsed value lives in its own variable so that a missing or
        # malformed header yields None instead of leaking the raw header
        # string to the caller.
        last_modified: Optional[datetime] = None

        # strptime raises TypeError (not ValueError) when given None, so a
        # missing header must be skipped before parsing.
        if last_modified_header:
            try:
                # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
                datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
                last_modified = datetime.strptime(last_modified_header, datetime_format)
            except ValueError:
                # Unparseable date: last_modified stays None
                pass

        return etag, last_modified
|
||||
|
||||
async def get_page(self) -> Tuple[BeautifulSoup, str]:
|
||||
async with self.session.get(self._url) as request:
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ from .utils import ReusableAsyncContextManager, fmt_path, fmt_real_path, prompt_
|
|||
SUFFIX_CHARS = string.ascii_lowercase + string.digits
|
||||
SUFFIX_LENGTH = 6
|
||||
TRIES = 5
|
||||
ETAG_KEY_PATTERN = "etag-{}"
|
||||
|
||||
|
||||
class OutputDirError(Exception):
|
||||
|
|
@ -57,6 +58,7 @@ class OnConflict(Enum):
|
|||
|
||||
@dataclass
|
||||
class Heuristics:
|
||||
etag: Optional[str]
|
||||
mtime: Optional[datetime]
|
||||
|
||||
|
||||
|
|
@ -193,6 +195,7 @@ class OutputDirectory:
|
|||
|
||||
def _should_download(
|
||||
self,
|
||||
path: PurePath,
|
||||
local_path: Path,
|
||||
heuristics: Heuristics,
|
||||
redownload: Redownload,
|
||||
|
|
@ -233,8 +236,20 @@ class OutputDirectory:
|
|||
|
||||
remote_newer = None
|
||||
|
||||
# ETag should be a more reliable indicator than mtime, so we check it first
|
||||
if heuristics.etag:
|
||||
local_etag: Optional[str] = None
|
||||
if self.prev_report:
|
||||
local_etag = self.prev_report.get_custom_value(ETAG_KEY_PATTERN.format(path))
|
||||
if local_etag:
|
||||
remote_newer = local_etag != heuristics.etag
|
||||
if remote_newer:
|
||||
log.explain("Remote file's entity tag differs")
|
||||
else:
|
||||
log.explain("Remote file's entity tag is the same")
|
||||
|
||||
# Python on Windows crashes when faced with timestamps around the unix epoch
|
||||
if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
|
||||
if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
|
||||
mtime = heuristics.mtime
|
||||
remote_newer = mtime.timestamp() > stat.st_mtime
|
||||
if remote_newer:
|
||||
|
|
@ -366,6 +381,8 @@ class OutputDirectory:
|
|||
self,
|
||||
remote_path: PurePath,
|
||||
path: PurePath,
|
||||
*,
|
||||
etag: Optional[str] = None,
|
||||
mtime: Optional[datetime] = None,
|
||||
redownload: Optional[Redownload] = None,
|
||||
on_conflict: Optional[OnConflict] = None,
|
||||
|
|
@ -375,14 +392,14 @@ class OutputDirectory:
|
|||
MarkConflictError.
|
||||
"""
|
||||
|
||||
heuristics = Heuristics(mtime)
|
||||
heuristics = Heuristics(etag, mtime)
|
||||
redownload = self._redownload if redownload is None else redownload
|
||||
on_conflict = self._on_conflict if on_conflict is None else on_conflict
|
||||
local_path = self.resolve(path)
|
||||
|
||||
self._report.mark(path)
|
||||
|
||||
if not self._should_download(local_path, heuristics, redownload, on_conflict):
|
||||
if not self._should_download(path, local_path, heuristics, redownload, on_conflict):
|
||||
return None
|
||||
|
||||
# Detect and solve local-dir-remote-file conflict
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue