Perform head requests sequentially

Real nice limiting is not easily possible with the current crawl/download abstraction sadly.
2026-04-12 15:35:05 +02:00 · 2024-10-27 17:30:00 +01:00 · 2024-10-27 17:30:00 +01:00 · bb36571e42
commit bb36571e42
parent 2193adadb4
1 changed files with 18 additions and 4 deletions
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@ -1,6 +1,7 @@
 import os
 import re
 from dataclasses import dataclass
+from datetime import datetime
 from pathlib import PurePath
 from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
 from urllib.parse import urljoin
@ -75,8 +76,11 @@ class KitIpdCrawler(HttpCrawler):
                if isinstance(item, KitIpdFolder):
                    tasks.append(self._crawl_folder(item))
                else:
+                    # do this here to at least be sequential and not parallel (rate limiting is hard, as the
+                    # crawl abstraction does not hold for these requests)
+                    etag, mtime = await self._request_resource_version(item.url)
                    # Orphan files are placed in the root folder
-                    tasks.append(self._download_file(PurePath("."), item))
+                    tasks.append(self._download_file(PurePath("."), item, etag, mtime))

        await self.gather(tasks)

@ -85,14 +89,24 @@ class KitIpdCrawler(HttpCrawler):
        if not await self.crawl(path):
            return

-        tasks = [self._download_file(path, file) for file in folder.files]
+        tasks = []
+        for file in folder.files:
+            # do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl
+            # abstraction does not hold for these requests)
+            etag, mtime = await self._request_resource_version(file.url)
+            tasks.append(self._download_file(path, file, etag, mtime))

        await self.gather(tasks)

-    async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
+    async def _download_file(
+        self,
+        parent: PurePath,
+        file: KitIpdFile,
+        etag: Optional[str],
+        mtime: Optional[datetime]
+    ) -> None:
        element_path = parent / file.name

-        etag, mtime = await self._request_resource_version(file.url)
        prev_etag = self._get_previous_etag_from_report(element_path)
        etag_differs = None if prev_etag is None else prev_etag != etag