mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 15:35:05 +02:00
also query files without a root folder
This commit is contained in:
parent
9f7aff761d
commit
09fcb19240
1 changed file with 28 additions and 11 deletions
|
|
@@ -3,7 +3,7 @@ import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from re import Pattern
|
from re import Pattern
|
||||||
from typing import List, Set, Union, AnyStr
|
from typing import List, Set, Union, AnyStr, Optional
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
@@ -95,22 +95,28 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
folder_tags: Set[Tag] = set()
|
folder_tags: Set[Tag] = set()
|
||||||
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
enclosing_table: Tag = element.findParent(name="table")
|
folder_label = self._fetch_folder_label(element)
|
||||||
if (enclosing_table) is None:
|
if folder_label is None:
|
||||||
continue
|
folder_tags.add(page)
|
||||||
label: Tag = enclosing_table.findPreviousSibling(name=re.compile('^h[1-6]$'))
|
else:
|
||||||
if label is not None:
|
folder_tags.add(folder_label)
|
||||||
folder_tags.add(label)
|
|
||||||
|
|
||||||
return folder_tags
|
return folder_tags
|
||||||
|
|
||||||
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
|
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
|
||||||
name = folder_tag.getText().strip()
|
|
||||||
files: List[KitIpdFile] = []
|
files: List[KitIpdFile] = []
|
||||||
|
# if files have been found outside a regular table
|
||||||
|
if not folder_tag.name.startswith("h"):
|
||||||
|
name = "."
|
||||||
|
root_links = filter(lambda f: self._fetch_folder_label(f) is None, self._find_file_links(folder_tag))
|
||||||
|
for link in root_links:
|
||||||
|
files.append(self._extract_file(link))
|
||||||
|
|
||||||
container: Tag = folder_tag.findNextSibling(name="table")
|
else:
|
||||||
for link in self._find_file_links(container):
|
name = folder_tag.getText().strip()
|
||||||
files.append(self._extract_file(link))
|
container: Tag = folder_tag.findNextSibling(name="table")
|
||||||
|
for link in self._find_file_links(container):
|
||||||
|
files.append(self._extract_file(link))
|
||||||
|
|
||||||
log.explain_topic(f"Found folder {name!r}")
|
log.explain_topic(f"Found folder {name!r}")
|
||||||
for file in files:
|
for file in files:
|
||||||
|
|
@@ -118,6 +124,17 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
|
|
||||||
return KitIpdFolder(name, files)
|
return KitIpdFolder(name, files)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _fetch_folder_label(file_link: Tag) -> Optional[Tag]:
|
||||||
|
enclosing_table: Tag = file_link.findParent(name="table")
|
||||||
|
if enclosing_table is None:
|
||||||
|
return None
|
||||||
|
label: Tag = enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
|
||||||
|
if label is None:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return label
|
||||||
|
|
||||||
def _extract_file(self, link: Tag) -> KitIpdFile:
|
def _extract_file(self, link: Tag) -> KitIpdFile:
|
||||||
url = self._abs_url_from_link(link)
|
url = self._abs_url_from_link(link)
|
||||||
name = os.path.basename(url)
|
name = os.path.basename(url)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue