Moved file_regex into the crawler section

This commit is contained in:
Toorero 2021-11-01 00:03:54 +01:00
parent deadfb2bf8
commit c4f276b7ae

View file

@ -2,7 +2,8 @@ import os
import re
from dataclasses import dataclass
from pathlib import PurePath
from typing import List, Set, Union
from re import Pattern
from typing import List, Set, Union, AnyStr
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag
@ -26,6 +27,10 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
return target
def link_regex(self) -> Pattern[str]:
    """Return the compiled regex used to select downloadable links.

    Reads the ``link_regex`` option from ``self.s``; when the option is
    absent, falls back to a pattern matching paths that end in a file
    name with a ``pdf``, ``zip``, ``c`` or ``java`` extension.

    Raises:
        re.error: If the configured value is not a valid regular expression.
    """
    # The fallback is passed positionally: neither ``dict.get`` nor
    # configparser's ``SectionProxy.get`` accepts a ``default=`` keyword,
    # so the keyword form raises TypeError on every call.
    # NOTE(review): presumably ``self.s`` is a configparser section proxy -
    # confirm against the section base class.
    # Raw string so ``\.`` reaches the regex engine without relying on
    # Python's deprecated handling of unknown string escapes.
    regex = self.s.get("link_regex", r".*/[^/]*\.(?:pdf|zip|c|java)")
    return re.compile(regex)
@dataclass
class KitIpdFile:
@ -49,8 +54,7 @@ class KitIpdCrawler(HttpCrawler):
):
super().__init__(name, section, config)
self._url = section.target()
self._config = filter(lambda t: t[0] == name, config.crawl_sections()).__next__()[1]
self._file_regex = self._fetch_file_regex()
self._file_regex = section.link_regex()
async def _run(self) -> None:
maybe_cl = await self.crawl(PurePath("."))
@ -122,10 +126,6 @@ class KitIpdCrawler(HttpCrawler):
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
    """Return every ``<a>`` element under *tag* whose ``href`` matches the file regex.

    Args:
        tag: The parsed element (or whole document) to search.

    Returns:
        The matching anchor tags, as returned by Beautiful Soup.
    """
    # ``find_all`` is the current Beautiful Soup 4 name; ``findAll`` is the
    # BS3-era alias, which newer releases deprecate.
    return tag.find_all(name="a", attrs={"href": self._file_regex})
def _fetch_file_regex(self) -> re.Pattern:
if "link_regex" in self._config:
return re.compile(self._config["link_regex"])
return re.compile(".*\/[^\/]*\.(?:pdf|zip|c|java)")
def _abs_url_from_link(self, link_tag: Tag) -> str:
    """Resolve the ``href`` of *link_tag* against the crawler's base URL."""
    href = link_tag.get("href")
    base_url = self._url
    return urljoin(base_url, href)