From 09fcb19240c1e204f1f13aaf49dd373829bd37e5 Mon Sep 17 00:00:00 2001 From: Toorero <22551563+Toorero@users.noreply.github.com> Date: Mon, 1 Nov 2021 01:57:33 +0100 Subject: [PATCH] also query files without a root folder --- PFERD/crawl/kit_ipd_crawler.py | 39 ++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index 6af22f6..1ed5ffe 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from pathlib import PurePath from re import Pattern -from typing import List, Set, Union, AnyStr +from typing import List, Set, Union, AnyStr, Optional from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -95,22 +95,28 @@ class KitIpdCrawler(HttpCrawler): folder_tags: Set[Tag] = set() for element in elements: - enclosing_table: Tag = element.findParent(name="table") - if (enclosing_table) is None: - continue - label: Tag = enclosing_table.findPreviousSibling(name=re.compile('^h[1-6]$')) - if label is not None: - folder_tags.add(label) + folder_label = self._fetch_folder_label(element) + if folder_label is None: + folder_tags.add(page) + else: + folder_tags.add(folder_label) return folder_tags def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: - name = folder_tag.getText().strip() files: List[KitIpdFile] = [] + # if files were found outside a regular table + if not folder_tag.name.startswith("h"): + name = "." 
+ root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag)) + for link in root_links: + files.append(self._extract_file(link)) - container: Tag = folder_tag.findNextSibling(name="table") - for link in self._find_file_links(container): - files.append(self._extract_file(link)) + else: + name = folder_tag.getText().strip() + container: Tag = folder_tag.findNextSibling(name="table") + for link in self._find_file_links(container): + files.append(self._extract_file(link)) log.explain_topic(f"Found folder {name!r}") for file in files: @@ -118,6 +124,17 @@ class KitIpdCrawler(HttpCrawler): return KitIpdFolder(name, files) + @staticmethod + def _fetch_folder_label(file_link: Tag) -> Optional[Tag]: + enclosing_table: Tag = file_link.findParent(name="table") + if enclosing_table is None: + return None + label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) + if label is None: + return None + else: + return label + def _extract_file(self, link: Tag) -> KitIpdFile: url = self._abs_url_from_link(link) name = os.path.basename(url)