From bb36571e421283ac24bc647d7d066631ff008f51 Mon Sep 17 00:00:00 2001 From: I-Al-Istannen Date: Sun, 27 Oct 2024 17:30:00 +0100 Subject: [PATCH] Perform head requests sequentially Proper rate limiting is not easily possible with the current crawl/download abstraction, sadly. --- PFERD/crawl/kit_ipd_crawler.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 073ac67..d9515e2 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -1,6 +1,7 @@ import os import re from dataclasses import dataclass +from datetime import datetime from pathlib import PurePath from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union from urllib.parse import urljoin @@ -75,8 +76,11 @@ class KitIpdCrawler(HttpCrawler): if isinstance(item, KitIpdFolder): tasks.append(self._crawl_folder(item)) else: + # do this here to at least be sequential and not parallel (rate limiting is hard, as the + # crawl abstraction does not hold for these requests) + etag, mtime = await self._request_resource_version(item.url) # Orphan files are placed in the root folder - tasks.append(self._download_file(PurePath("."), item)) + tasks.append(self._download_file(PurePath("."), item, etag, mtime)) await self.gather(tasks) @@ -85,14 +89,24 @@ class KitIpdCrawler(HttpCrawler): if not await self.crawl(path): return - tasks = [self._download_file(path, file) for file in folder.files] + tasks = [] + for file in folder.files: + # do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl + # abstraction does not hold for these requests) + etag, mtime = await self._request_resource_version(file.url) + tasks.append(self._download_file(path, file, etag, mtime)) await self.gather(tasks) - async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None: + async def _download_file( + self, + parent: PurePath, + file: KitIpdFile, 
etag: Optional[str], + mtime: Optional[datetime] + ) -> None: element_path = parent / file.name - etag, mtime = await self._request_resource_version(file.url) prev_etag = self._get_previous_etag_from_report(element_path) etag_differs = None if prev_etag is None else prev_etag != etag