diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index 689cca5..20440cf 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -2,7 +2,8 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import PurePath
+from re import Pattern
 from typing import List, Set, Union
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Tag
@@ -26,6 +27,17 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
 
         return target
 
+    def link_regex(self) -> Pattern[str]:
+        """Return the compiled pattern used to pick file links by their href.
+
+        Reads the optional ``link_regex`` option of this section; when it is
+        absent, the default pattern for pdf, zip, c and java links is used.
+        """
+        # The fallback is passed positionally so the call works whether the
+        # parameter is named ``fallback`` (configparser) or ``default`` (Mapping).
+        regex = self.s.get("link_regex", r".*/[^/]*\.(?:pdf|zip|c|java)")
+        return re.compile(regex)
+
 
 @dataclass
 class KitIpdFile:
@@ -49,8 +61,7 @@ class KitIpdCrawler(HttpCrawler):
     ):
         super().__init__(name, section, config)
         self._url = section.target()
-        self._config = filter(lambda t: t[0] == name, config.crawl_sections()).__next__()[1]
-        self._file_regex = self._fetch_file_regex()
+        self._file_regex = section.link_regex()
 
     async def _run(self) -> None:
         maybe_cl = await self.crawl(PurePath("."))
@@ -122,10 +133,6 @@ class KitIpdCrawler(HttpCrawler):
 
     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": self._file_regex})
-    def _fetch_file_regex(self) -> re.Pattern:
-        if "link_regex" in self._config:
-            return re.compile(self._config["link_regex"])
-        return re.compile(".*\/[^\/]*\.(?:pdf|zip|c|java)")
 
     def _abs_url_from_link(self, link_tag: Tag) -> str:
         return urljoin(self._url, link_tag.get("href"))