From 0b2bcae44c219504bd2b4a8c5fc63e0a1ff6d7e9 Mon Sep 17 00:00:00 2001
From: Toorero <22551563+Toorero@users.noreply.github.com>
Date: Mon, 25 Oct 2021 21:24:51 +0200
Subject: [PATCH] Changed behaviour of kit-ipd-crawler

---
Review notes (this section is ignored by `git am`):
- Folder labels are now looked up as the <h3> preceding the enclosing
  <table>, instead of the <td> preceding the enclosing <td>.
- File names are now taken from the basename of the link URL instead of
  the link text plus the URL's extension.
- The leftover debug `print(folder_tags)` was dropped from the first hunk;
  hunk headers and the diffstat were adjusted accordingly. The post-image
  blob hash on the `index` line was not recomputed.

 PFERD/crawl/kit_ipd_crawler.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py
index 4d4addd..593f83a 100644
--- a/PFERD/crawl/kit_ipd_crawler.py
+++ b/PFERD/crawl/kit_ipd_crawler.py
@@ -88,17 +88,17 @@ class KitIpdCrawler(HttpCrawler):
         folder_tags: Set[Tag] = set()
 
         for element in elements:
-            enclosing_data: Tag = element.findParent(name="td")
-            label: Tag = enclosing_data.findPreviousSibling(name="td")
+            enclosing_table: Tag = element.findParent(name="table")
+            label: Tag = enclosing_table.findPreviousSibling(name="h3")
             folder_tags.add(label)
 
         return folder_tags
 
     def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
         name = folder_tag.getText().strip()
         files: List[KitIpdFile] = []
 
-        container: Tag = folder_tag.findNextSibling(name="td")
+        container: Tag = folder_tag.findNextSibling(name="table")
         for link in self._find_file_links(container):
             files.append(self._extract_file(link))
 
@@ -109,10 +109,9 @@ class KitIpdCrawler(HttpCrawler):
         return KitIpdFolder(name, files)
 
     def _extract_file(self, link: Tag) -> KitIpdFile:
-        name = link.getText().strip()
         url = self._abs_url_from_link(link)
-        _, extension = os.path.splitext(url)
-        return KitIpdFile(name + extension, url)
+        name = os.path.basename(url)
+        return KitIpdFile(name, url)
 
     def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
         return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})