mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 23:45:05 +02:00
Move etag/mtime logic to more generic HTTP crawler
This commit is contained in:
parent
993d934825
commit
58548a1be9
2 changed files with 49 additions and 43 deletions
|
|
@ -1,8 +1,9 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import http.cookies
|
import http.cookies
|
||||||
import ssl
|
import ssl
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import certifi
|
import certifi
|
||||||
|
|
@ -15,6 +16,8 @@ from ..utils import fmt_real_path
|
||||||
from ..version import NAME, VERSION
|
from ..version import NAME, VERSION
|
||||||
from .crawler import Crawler, CrawlerSection
|
from .crawler import Crawler, CrawlerSection
|
||||||
|
|
||||||
|
# Key under which the mapping of str(path) -> entity tag is stored in a report's custom values.
ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
|
||||||
|
|
||||||
|
|
||||||
class HttpCrawlerSection(CrawlerSection):
|
class HttpCrawlerSection(CrawlerSection):
|
||||||
def http_timeout(self) -> float:
|
def http_timeout(self) -> float:
|
||||||
|
|
@ -169,6 +172,50 @@ class HttpCrawler(Crawler):
|
||||||
log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
|
log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
|
||||||
log.warn(str(e))
|
log.warn(str(e))
|
||||||
|
|
||||||
|
def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
    """
    Looks up the entity tag that the previous report recorded for the given path.
    Yields None when there is no previous report or it holds no entry for the path.
    """
    previous_report = self._output_dir.prev_report
    if previous_report:
        # The custom value may be absent, in which case an empty mapping is used.
        recorded = previous_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY)
        return (recorded or {}).get(str(path))

    return None
|
||||||
|
|
||||||
|
def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
    """
    Records the entity tag of the given path in the custom values of the current report.
    Nothing is recorded if the entity tag is None or empty.
    """
    if etag:
        current_report = self._output_dir.report
        # Start from the mapping stored so far (or an empty one) and write it back after the update.
        recorded = current_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
        recorded[str(path)] = etag
        current_report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, recorded)
|
||||||
|
|
||||||
|
async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]:
    """
    Requests the ETag and Last-Modified headers of a resource via a HEAD request.
    If no entity tag / modification date can be obtained, the according value will be None.

    Both values are None if the response status is not 200. The modification date is None
    if the Last-Modified header is missing or does not match the expected HTTP date format.
    The returned datetime is naive (it carries no tzinfo).
    """
    async with self.session.head(resource_url) as resp:
        if resp.status != 200:
            return None, None

        etag_header = resp.headers.get("ETag")
        last_modified_header = resp.headers.get("Last-Modified")

        # Bind the result up front. Without this, a missing or unparsable header leaves the
        # name unassigned and the return statement below raises UnboundLocalError.
        last_modified: Optional[datetime] = None

        if last_modified_header:
            try:
                # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
                datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
                last_modified = datetime.strptime(last_modified_header, datetime_format)
            except ValueError:
                # last_modified remains None
                pass

        return etag_header, last_modified
|
||||||
|
|
||||||
async def run(self) -> None:
|
async def run(self) -> None:
|
||||||
self._request_count = 0
|
self._request_count = 0
|
||||||
self._cookie_jar = aiohttp.CookieJar()
|
self._cookie_jar = aiohttp.CookieJar()
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
|
from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
@ -15,8 +14,6 @@ from ..utils import soupify
|
||||||
from .crawler import CrawlError
|
from .crawler import CrawlError
|
||||||
from .http_crawler import HttpCrawler, HttpCrawlerSection
|
from .http_crawler import HttpCrawler, HttpCrawlerSection
|
||||||
|
|
||||||
# Key under which the mapping of str(path) -> entity tag is stored in a report's custom values.
ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
|
|
||||||
|
|
||||||
|
|
||||||
class KitIpdCrawlerSection(HttpCrawlerSection):
|
class KitIpdCrawlerSection(HttpCrawlerSection):
|
||||||
def target(self) -> str:
|
def target(self) -> str:
|
||||||
|
|
@ -95,7 +92,7 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
|
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
|
||||||
element_path = parent / file.name
|
element_path = parent / file.name
|
||||||
|
|
||||||
etag, mtime = await self._request_file_version(file)
|
etag, mtime = await self._request_resource_version(file.url)
|
||||||
prev_etag = self._get_previous_etag_from_report(element_path)
|
prev_etag = self._get_previous_etag_from_report(element_path)
|
||||||
etag_differs = None if prev_etag is None else prev_etag != etag
|
etag_differs = None if prev_etag is None else prev_etag != etag
|
||||||
|
|
||||||
|
|
@ -172,44 +169,6 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
|
|
||||||
self._add_etag_to_report(path, resp.headers.get("ETag"))
|
self._add_etag_to_report(path, resp.headers.get("ETag"))
|
||||||
|
|
||||||
def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
    """
    Looks up the entity tag that the previous report recorded for the given path.
    Yields None when there is no previous report or it holds no entry for the path.
    """
    previous_report = self._output_dir.prev_report
    if previous_report:
        # The custom value may be absent, in which case an empty mapping is used.
        recorded = previous_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY)
        return (recorded or {}).get(str(path))

    return None
|
|
||||||
|
|
||||||
def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
    """
    Records the entity tag of the given path in the custom values of the current report.
    Nothing is recorded if the entity tag is None or empty.
    """
    if etag:
        current_report = self._output_dir.report
        # Start from the mapping stored so far (or an empty one) and write it back after the update.
        recorded = current_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
        recorded[str(path)] = etag
        current_report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, recorded)
|
|
||||||
|
|
||||||
async def _request_file_version(self, file: KitIpdFile) -> Tuple[Optional[str], Optional[datetime]]:
    """
    Request the ETag and Last-Modified headers of a file via a HEAD request.
    If no etag / modification date can be obtained, the according value will be None.

    Both values are None if the response status is not 200. The modification date is None
    if the Last-Modified header is missing or does not match the expected HTTP date format.
    The returned datetime is naive (it carries no tzinfo).
    """
    async with self.session.head(file.url) as resp:
        if resp.status != 200:
            return None, None

        etag_header = resp.headers.get("ETag")
        last_modified_header = resp.headers.get("Last-Modified")

        # Bind the result up front. Without this, a missing or unparsable header leaves the
        # name unassigned and the return statement below raises UnboundLocalError.
        last_modified: Optional[datetime] = None

        if last_modified_header:
            try:
                # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives
                datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
                last_modified = datetime.strptime(last_modified_header, datetime_format)
            except ValueError:
                # last_modified remains None
                pass

        return etag_header, last_modified
|
|
||||||
|
|
||||||
async def get_page(self) -> Tuple[BeautifulSoup, str]:
|
async def get_page(self) -> Tuple[BeautifulSoup, str]:
|
||||||
async with self.session.get(self._url) as request:
|
async with self.session.get(self._url) as request:
|
||||||
# The web page for Algorithmen für Routenplanung contains some
|
# The web page for Algorithmen für Routenplanung contains some
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue