Move etag/mtime logic to more generic HTTP crawler

This commit is contained in:
Scriptim 2024-10-27 12:36:08 +01:00
parent 993d934825
commit 58548a1be9
No known key found for this signature in database
GPG key ID: 1ABB18EA42CCAAF6
2 changed files with 49 additions and 43 deletions

View file

@ -1,8 +1,9 @@
import asyncio
import http.cookies
import ssl
from datetime import datetime
from pathlib import Path, PurePath
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
import aiohttp
import certifi
@ -15,6 +16,8 @@ from ..utils import fmt_real_path
from ..version import NAME, VERSION
from .crawler import Crawler, CrawlerSection
ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
class HttpCrawlerSection(CrawlerSection):
def http_timeout(self) -> float:
@ -169,6 +172,50 @@ class HttpCrawler(Crawler):
log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
log.warn(str(e))
def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
"""
If available, retrieves the entity tag for a given path which was stored in the previous report.
"""
if not self._output_dir.prev_report:
return None
etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
return etags.get(str(path))
def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
"""
Adds an entity tag for a given path to the report's custom values.
"""
if not etag:
return
etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
etags[str(path)] = etag
self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags)
async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]:
"""
Requests the ETag and Last-Modified headers of a resource via a HEAD request.
If no entity tag / modification date can be obtained, the according value will be None.
"""
async with self.session.head(resource_url) as resp:
if resp.status != 200:
return None, None
etag_header = resp.headers.get("ETag")
last_modified_header = resp.headers.get("Last-Modified")
if last_modified_header:
try:
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
last_modified = datetime.strptime(last_modified_header, datetime_format)
except ValueError:
# last_modified remains None
pass
return etag_header, last_modified
async def run(self) -> None:
self._request_count = 0
self._cookie_jar = aiohttp.CookieJar()

View file

@ -1,7 +1,6 @@
import os
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import PurePath
from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
from urllib.parse import urljoin
@ -15,8 +14,6 @@ from ..utils import soupify
from .crawler import CrawlError
from .http_crawler import HttpCrawler, HttpCrawlerSection
ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
class KitIpdCrawlerSection(HttpCrawlerSection):
def target(self) -> str:
@ -95,7 +92,7 @@ class KitIpdCrawler(HttpCrawler):
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
element_path = parent / file.name
etag, mtime = await self._request_file_version(file)
etag, mtime = await self._request_resource_version(file.url)
prev_etag = self._get_previous_etag_from_report(element_path)
etag_differs = None if prev_etag is None else prev_etag != etag
@ -172,44 +169,6 @@ class KitIpdCrawler(HttpCrawler):
self._add_etag_to_report(path, resp.headers.get("ETag"))
def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
if not self._output_dir.prev_report:
return None
etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
return etags.get(str(path))
def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
if not etag:
return
etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
etags[str(path)] = etag
self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags)
async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
"""
Request the ETag and Last-Modified headers of a file via a HEAD request.
If no etag / modification date can be obtained, the according value will be None.
"""
async with self.session.head(file.url) as resp:
if resp.status != 200:
return None, None
etag_header = resp.headers.get("ETag")
last_modified_header = resp.headers.get("Last-Modified")
if last_modified_header:
try:
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
last_modified = datetime.strptime(last_modified_header, datetime_format)
except ValueError:
# last_modified remains None
pass
return etag_header, last_modified
async def get_page(self) -> Tuple[BeautifulSoup, str]:
async with self.session.get(self._url) as request:
# The web page for Algorithmen für Routenplanung contains some