Moved file_regex into crawler section

This commit is contained in:
Toorero 2021-11-01 00:03:54 +01:00
parent deadfb2bf8
commit c4f276b7ae

View file

@ -2,7 +2,8 @@ import os
import re import re
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import PurePath from pathlib import PurePath
from typing import List, Set, Union from re import Pattern
from typing import List, Set, Union, AnyStr
from urllib.parse import urljoin from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
@ -26,6 +27,10 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
return target return target
def link_regex(self) -> Pattern[str]:
    """Return the compiled regex used to pick file links.

    The pattern text is read from the ``link_regex`` option of ``self.s``.
    When that option is missing, a default is used that looks for a
    ``/``-separated segment containing ``.pdf``, ``.zip``, ``.c`` or
    ``.java`` (the default is not anchored at the end of the string).

    Returns:
        The compiled ``str`` pattern.

    Raises:
        re.error: If the configured pattern is not a valid regex.
    """
    # NOTE(review): ``self.s`` is defined outside this view; presumably a
    # configparser section proxy -- confirm.  The fallback is passed
    # positionally because neither ``dict.get`` nor configparser's
    # ``SectionProxy.get`` accepts a ``default=`` keyword (TypeError).
    # Raw string: ``\.`` is an invalid escape in a normal string literal.
    pattern_text = self.s.get("link_regex", r".*/[^/]*\.(?:pdf|zip|c|java)")
    return re.compile(pattern_text)
@dataclass @dataclass
class KitIpdFile: class KitIpdFile:
@ -49,8 +54,7 @@ class KitIpdCrawler(HttpCrawler):
): ):
super().__init__(name, section, config) super().__init__(name, section, config)
self._url = section.target() self._url = section.target()
self._config = filter(lambda t: t[0] == name, config.crawl_sections()).__next__()[1] self._file_regex = section.link_regex()
self._file_regex = self._fetch_file_regex()
async def _run(self) -> None: async def _run(self) -> None:
maybe_cl = await self.crawl(PurePath(".")) maybe_cl = await self.crawl(PurePath("."))
@ -122,10 +126,6 @@ class KitIpdCrawler(HttpCrawler):
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
    """Search *tag* for ``<a>`` elements, filtering on their ``href``.

    ``self._file_regex`` is handed to ``findAll`` as the ``href``
    attribute filter; whatever ``findAll`` returns is passed through.
    """
    href_filter = {"href": self._file_regex}
    return tag.findAll(name="a", attrs=href_filter)
def _fetch_file_regex(self) -> re.Pattern:
if "link_regex" in self._config:
return re.compile(self._config["link_regex"])
return re.compile(".*\/[^\/]*\.(?:pdf|zip|c|java)")
def _abs_url_from_link(self, link_tag: Tag) -> str:
    """Join the ``href`` of *link_tag* onto ``self._url`` via ``urljoin``."""
    href = link_tag.get("href")
    return urljoin(self._url, href)