diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py
index 44ec4dd..6e22c56 100644
--- a/PFERD/crawl/http_crawler.py
+++ b/PFERD/crawl/http_crawler.py
@@ -1,8 +1,9 @@
 import asyncio
 import http.cookies
 import ssl
+from datetime import datetime
 from pathlib import Path, PurePath
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import aiohttp
 import certifi
@@ -15,6 +16,8 @@ from ..utils import fmt_real_path
 from ..version import NAME, VERSION
 from .crawler import Crawler, CrawlerSection
 
+ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
+
 
 class HttpCrawlerSection(CrawlerSection):
     def http_timeout(self) -> float:
@@ -169,6 +172,52 @@ class HttpCrawler(Crawler):
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
             log.warn(str(e))
 
+    def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
+        """
+        If available, retrieves the entity tag for a given path which was stored in the previous report.
+        """
+        if not self._output_dir.prev_report:
+            return None
+
+        etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
+        return etags.get(str(path))
+
+    def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
+        """
+        Adds an entity tag for a given path to the report's custom values.
+        """
+        if not etag:
+            return
+
+        etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
+        etags[str(path)] = etag
+        self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags)
+
+    async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]:
+        """
+        Requests the ETag and Last-Modified headers of a resource via a HEAD request.
+        If no entity tag / modification date can be obtained, the corresponding value will be None.
+        """
+        async with self.session.head(resource_url) as resp:
+            if resp.status != 200:
+                return None, None
+
+            etag_header = resp.headers.get("ETag")
+            last_modified_header = resp.headers.get("Last-Modified")
+            # Bind up front: the header may be absent or unparsable, and the return below reads this name.
+            last_modified: Optional[datetime] = None
+
+            if last_modified_header:
+                try:
+                    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
+                    datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
+                    last_modified = datetime.strptime(last_modified_header, datetime_format)
+                except ValueError:
+                    # last_modified remains None
+                    pass
+
+            return etag_header, last_modified
+
     async def run(self) -> None:
         self._request_count = 0
         self._cookie_jar = aiohttp.CookieJar()
diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index e7a15de..073ac67 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -1,7 +1,6 @@
 import os
 import re
 from dataclasses import dataclass
-from datetime import datetime
 from pathlib import PurePath
 from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin
@@ -15,8 +14,6 @@ from ..utils import soupify
 from .crawler import CrawlError
 from .http_crawler import HttpCrawler, HttpCrawlerSection
 
-ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
-
 
 class KitIpdCrawlerSection(HttpCrawlerSection):
     def target(self) -> str:
@@ -95,7 +92,7 @@ class KitIpdCrawler(HttpCrawler):
     async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
         element_path = parent / file.name
 
-        etag, mtime = await self._request_file_version(file)
+        etag, mtime = await self._request_resource_version(file.url)
         prev_etag = self._get_previous_etag_from_report(element_path)
         etag_differs = None if prev_etag is None else prev_etag != etag
 
@@ -172,44 +169,6 @@ class KitIpdCrawler(HttpCrawler):
 
             self._add_etag_to_report(path, resp.headers.get("ETag"))
 
-    def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
-        if not self._output_dir.prev_report:
-            return None
-
-        etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
-        return etags.get(str(path))
-
-    def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
-        if not etag:
-            return
-
-        etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
-        etags[str(path)] = etag
-        self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags)
-
-    async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
-        """
-        Request the ETag and Last-Modified headers of a file via a HEAD request.
-        If no etag / modification date can be obtained, the according value will be None.
-        """
-        async with self.session.head(file.url) as resp:
-            if resp.status != 200:
-                return None, None
-
-            etag_header = resp.headers.get("ETag")
-            last_modified_header = resp.headers.get("Last-Modified")
-
-            if last_modified_header:
-                try:
-                    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
-                    datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
-                    last_modified = datetime.strptime(last_modified_header, datetime_format)
-                except ValueError:
-                    # last_modified remains None
-                    pass
-
-            return etag_header, last_modified
-
     async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some