mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 15:35:05 +02:00
moved file_regex in crawler section
This commit is contained in:
parent
deadfb2bf8
commit
c4f276b7ae
1 changed file with 7 additions and 7 deletions
|
|
@ -2,7 +2,8 @@ import os
|
|||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import PurePath
|
||||
from typing import List, Set, Union
|
||||
from re import Pattern
|
||||
from typing import List, Set, Union, AnyStr
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
|
@ -26,6 +27,10 @@ class KitIpdCrawlerSection(HttpCrawlerSection):
|
|||
|
||||
return target
|
||||
|
||||
def link_regex(self) -> Pattern[str]:
    """Return the compiled pattern used to pick out file links.

    The pattern text is read from the ``link_regex`` option of ``self.s``.
    When that option is absent, a default is used that looks for a ``/``
    followed by a file name carrying a ``.pdf``, ``.zip``, ``.c`` or
    ``.java`` extension. The default is not anchored at the end of the
    string.

    Returns:
        The compiled regular expression.

    Raises:
        re.error: If the configured value is not a valid regular expression.
    """
    # The fallback is passed positionally: neither ``dict.get`` nor
    # ``configparser.SectionProxy.get`` accepts a ``default=`` keyword, so
    # the keyword form raises ``TypeError`` instead of falling back.
    # NOTE(review): assumes ``self.s`` is a mapping-like section whose
    # ``get`` takes the fallback as its second positional argument -- confirm.
    # Raw string avoids the invalid ``\.`` string escape; the regex text is
    # unchanged.
    regex = self.s.get("link_regex", r".*/[^/]*\.(?:pdf|zip|c|java)")
    return re.compile(regex)
|
||||
|
||||
|
||||
@dataclass
|
||||
class KitIpdFile:
|
||||
|
|
@ -49,8 +54,7 @@ class KitIpdCrawler(HttpCrawler):
|
|||
):
|
||||
super().__init__(name, section, config)
|
||||
self._url = section.target()
|
||||
self._config = filter(lambda t: t[0] == name, config.crawl_sections()).__next__()[1]
|
||||
self._file_regex = self._fetch_file_regex()
|
||||
self._file_regex = section.link_regex()
|
||||
|
||||
async def _run(self) -> None:
|
||||
maybe_cl = await self.crawl(PurePath("."))
|
||||
|
|
@ -122,10 +126,6 @@ class KitIpdCrawler(HttpCrawler):
|
|||
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
||||
return tag.findAll(name="a", attrs={"href": self._file_regex})
|
||||
|
||||
def _fetch_file_regex(self) -> re.Pattern:
|
||||
if "link_regex" in self._config:
|
||||
return re.compile(self._config["link_regex"])
|
||||
return re.compile(".*\/[^\/]*\.(?:pdf|zip|c|java)")
|
||||
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
||||
return urljoin(self._url, link_tag.get("href"))
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue