From deadfb2bf887e5b1bc1ca7db548a17e62afa9ad4 Mon Sep 17 00:00:00 2001
From: Toorero <22551563+Toorero@users.noreply.github.com>
Date: Sun, 31 Oct 2021 22:32:11 +0100
Subject: [PATCH] Simplified default regex

---
 PFERD/crawl/kit_ipd_crawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index 2e47190..689cca5 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -125,7 +125,7 @@ class KitIpdCrawler(HttpCrawler):
     def _fetch_file_regex(self) -> re.Pattern:
         if "link_regex" in self._config:
             return re.compile(self._config["link_regex"])
-        return re.compile(".*\/[^\/]*\.(?:(?:pdf)|(?:zip)|(?:c)|(?:java))")
+        return re.compile(".*\/[^\/]*\.(?:pdf|zip|c|java)")
 
     def _abs_url_from_link(self, link_tag: Tag) -> str:
         return urljoin(self._url, link_tag.get("href"))