mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 23:45:05 +02:00
More general search pattern, only focusing on .pdf links
This commit is contained in:
parent
0b2bcae44c
commit
ac35dd9a47
1 changed files with 5 additions and 2 deletions
|
|
@ -1,4 +1,5 @@
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import List, Set, Union
|
from typing import List, Set, Union
|
||||||
|
|
@ -89,7 +90,9 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
enclosing_table: Tag = element.findParent(name="table")
|
enclosing_table: Tag = element.findParent(name="table")
|
||||||
label: Tag = enclosing_table.findPreviousSibling(name="h3")
|
if (enclosing_table) is None:
|
||||||
|
continue
|
||||||
|
label: Tag = enclosing_table.findPreviousSibling(name=re.compile('^h[1-6]$'))
|
||||||
folder_tags.add(label)
|
folder_tags.add(label)
|
||||||
|
|
||||||
print(folder_tags)
|
print(folder_tags)
|
||||||
|
|
@ -115,7 +118,7 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
return KitIpdFile(name, url)
|
return KitIpdFile(name, url)
|
||||||
|
|
||||||
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
|
||||||
return tag.findAll(name="a", attrs={"href": lambda x: x and "intern" in x})
|
return tag.findAll(name="a", attrs={"href": lambda x: x and ".pdf" in x})
|
||||||
|
|
||||||
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
def _abs_url_from_link(self, link_tag: Tag) -> str:
|
||||||
return urljoin(self._url, link_tag.get("href"))
|
return urljoin(self._url, link_tag.get("href"))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue