mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 15:35:05 +02:00
Perform HEAD requests sequentially
Really nice rate limiting is sadly not easily possible with the current crawl/download abstraction.
This commit is contained in:
parent
2193adadb4
commit
bb36571e42
1 changed files with 18 additions and 4 deletions
|
|
@ -1,6 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
|
from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
@ -75,8 +76,11 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
if isinstance(item, KitIpdFolder):
|
if isinstance(item, KitIpdFolder):
|
||||||
tasks.append(self._crawl_folder(item))
|
tasks.append(self._crawl_folder(item))
|
||||||
else:
|
else:
|
||||||
|
# do this here to at least be sequential and not parallel (rate limiting is hard, as the
|
||||||
|
# crawl abstraction does not hold for these requests)
|
||||||
|
etag, mtime = await self._request_resource_version(item.url)
|
||||||
# Orphan files are placed in the root folder
|
# Orphan files are placed in the root folder
|
||||||
tasks.append(self._download_file(PurePath("."), item))
|
tasks.append(self._download_file(PurePath("."), item, etag, mtime))
|
||||||
|
|
||||||
await self.gather(tasks)
|
await self.gather(tasks)
|
||||||
|
|
||||||
|
|
@ -85,14 +89,24 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
if not await self.crawl(path):
|
if not await self.crawl(path):
|
||||||
return
|
return
|
||||||
|
|
||||||
tasks = [self._download_file(path, file) for file in folder.files]
|
tasks = []
|
||||||
|
for file in folder.files:
|
||||||
|
# do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl
|
||||||
|
# abstraction does not hold for these requests)
|
||||||
|
etag, mtime = await self._request_resource_version(file.url)
|
||||||
|
tasks.append(self._download_file(path, file, etag, mtime))
|
||||||
|
|
||||||
await self.gather(tasks)
|
await self.gather(tasks)
|
||||||
|
|
||||||
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
|
async def _download_file(
|
||||||
|
self,
|
||||||
|
parent: PurePath,
|
||||||
|
file: KitIpdFile,
|
||||||
|
etag: Optional[str],
|
||||||
|
mtime: Optional[datetime]
|
||||||
|
) -> None:
|
||||||
element_path = parent / file.name
|
element_path = parent / file.name
|
||||||
|
|
||||||
etag, mtime = await self._request_resource_version(file.url)
|
|
||||||
prev_etag = self._get_previous_etag_from_report(element_path)
|
prev_etag = self._get_previous_etag_from_report(element_path)
|
||||||
etag_differs = None if prev_etag is None else prev_etag != etag
|
etag_differs = None if prev_etag is None else prev_etag != etag
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue