mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 15:35:05 +02:00
Added regex config value for discovering file-links
This commit is contained in:
parent
be6214722b
commit
daf5e4e684
1 changed file with 7 additions and 1 deletion
|
|
@ -49,6 +49,8 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
):
|
):
|
||||||
super().__init__(name, section, config)
|
super().__init__(name, section, config)
|
||||||
self._url = section.target()
|
self._url = section.target()
|
||||||
|
self._config = filter(lambda t: t[0] == name, config.crawl_sections()).__next__()[1]
|
||||||
|
self._file_regex = self._fetch_file_regex()
|
||||||
|
|
||||||
async def _run(self) -> None:
|
async def _run(self) -> None:
|
||||||
maybe_cl = await self.crawl(PurePath("."))
|
maybe_cl = await self.crawl(PurePath("."))
|
||||||
|
|
@ -119,8 +121,12 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
return KitIpdFile(name, url)
|
return KitIpdFile(name, url)
|
||||||
|
|
||||||
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
||||||
return tag.findAll(name="a", attrs={"href": lambda x: x and ".pdf" in x})
|
return tag.findAll(name="a", attrs={"href": self._file_regex})
|
||||||
|
|
||||||
|
def _fetch_file_regex(self) -> re.Pattern:
|
||||||
|
if "link_regex" in self._config:
|
||||||
|
return re.compile(self._config["link_regex"])
|
||||||
|
return re.compile(".*\/[^\/]*\.(?:(?:pdf)|(?:zip)|(?:c)|(?:java))")
|
||||||
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
||||||
return urljoin(self._url, link_tag.get("href"))
|
return urljoin(self._url, link_tag.get("href"))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue