Perform HEAD requests sequentially

Proper rate limiting is not easily possible with the current
crawl/download abstraction, sadly.
This commit is contained in:
I-Al-Istannen 2024-10-27 17:30:00 +01:00
parent 2193adadb4
commit bb36571e42

View file

@ -1,6 +1,7 @@
import os import os
import re import re
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime
from pathlib import PurePath from pathlib import PurePath
from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
from urllib.parse import urljoin from urllib.parse import urljoin
@ -75,8 +76,11 @@ class KitIpdCrawler(HttpCrawler):
if isinstance(item, KitIpdFolder): if isinstance(item, KitIpdFolder):
tasks.append(self._crawl_folder(item)) tasks.append(self._crawl_folder(item))
else: else:
# do this here to at least be sequential and not parallel (rate limiting is hard, as the
# crawl abstraction does not hold for these requests)
etag, mtime = await self._request_resource_version(item.url)
# Orphan files are placed in the root folder # Orphan files are placed in the root folder
tasks.append(self._download_file(PurePath("."), item)) tasks.append(self._download_file(PurePath("."), item, etag, mtime))
await self.gather(tasks) await self.gather(tasks)
@ -85,14 +89,24 @@ class KitIpdCrawler(HttpCrawler):
if not await self.crawl(path): if not await self.crawl(path):
return return
tasks = [self._download_file(path, file) for file in folder.files] tasks = []
for file in folder.files:
# do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl
# abstraction does not hold for these requests)
etag, mtime = await self._request_resource_version(file.url)
tasks.append(self._download_file(path, file, etag, mtime))
await self.gather(tasks) await self.gather(tasks)
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None: async def _download_file(
self,
parent: PurePath,
file: KitIpdFile,
etag: Optional[str],
mtime: Optional[datetime]
) -> None:
element_path = parent / file.name element_path = parent / file.name
etag, mtime = await self._request_resource_version(file.url)
prev_etag = self._get_previous_etag_from_report(element_path) prev_etag = self._get_previous_etag_from_report(element_path)
etag_differs = None if prev_etag is None else prev_etag != etag etag_differs = None if prev_etag is None else prev_etag != etag