also query files without a root folder

This commit is contained in:
Toorero 2021-11-01 01:57:33 +01:00
parent 9f7aff761d
commit 09fcb19240

View file

@ -3,7 +3,7 @@ import re
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import PurePath from pathlib import PurePath
from re import Pattern from re import Pattern
from typing import List, Set, Union, AnyStr from typing import List, Set, Union, AnyStr, Optional
from urllib.parse import urljoin from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
@ -95,19 +95,25 @@ class KitIpdCrawler(HttpCrawler):
folder_tags: Set[Tag] = set() folder_tags: Set[Tag] = set()
for element in elements: for element in elements:
enclosing_table: Tag = element.findParent(name="table") folder_label = self._fetch_folder_label(element)
if (enclosing_table) is None: if folder_label is None:
continue folder_tags.add(page)
label: Tag = enclosing_table.findPreviousSibling(name=re.compile('^h[1-6]$')) else:
if label is not None: folder_tags.add(folder_label)
folder_tags.add(label)
return folder_tags return folder_tags
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
name = folder_tag.getText().strip()
files: List[KitIpdFile] = [] files: List[KitIpdFile] = []
# if files have been found outside a regular table
if not folder_tag.name.startswith("h"):
name = "."
root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag))
for link in root_links:
files.append(self._extract_file(link))
else:
name = folder_tag.getText().strip()
container: Tag = folder_tag.findNextSibling(name="table") container: Tag = folder_tag.findNextSibling(name="table")
for link in self._find_file_links(container): for link in self._find_file_links(container):
files.append(self._extract_file(link)) files.append(self._extract_file(link))
@ -118,6 +124,17 @@ class KitIpdCrawler(HttpCrawler):
return KitIpdFolder(name, files) return KitIpdFolder(name, files)
@staticmethod
def _fetch_folder_label(file_link: Tag) -> Optional[Tag]:
    """Return the heading tag that labels the table containing ``file_link``.

    The lookup first asks for the ``table`` element enclosing ``file_link``
    and then for a sibling preceding that table whose tag name is one of
    ``h1`` to ``h6``.

    Returns ``None`` when the link is not inside a table, or when the table
    has no such preceding heading sibling.
    """
    table = file_link.findParent(name="table")
    if table is None:
        # Link lives outside any table, so there is no folder heading for it.
        return None
    # findPreviousSibling already yields None when nothing matches, so its
    # result can be handed back unchanged.
    return table.findPreviousSibling(name=re.compile("^h[1-6]$"))
def _extract_file(self, link: Tag) -> KitIpdFile: def _extract_file(self, link: Tag) -> KitIpdFile:
url = self._abs_url_from_link(link) url = self._abs_url_from_link(link)
name = os.path.basename(url) name = os.path.basename(url)