mirror of
https://github.com/Garmelon/PFERD.git
synced 2026-04-12 07:25:04 +02:00
Fix IPD crawler crashing on weird HTML comments
This commit is contained in:
parent
afbd03f777
commit
bc3fa36637
2 changed files with 10 additions and 1 deletions
|
|
@ -25,6 +25,9 @@ ambiguous situations.
|
||||||
### Changed
|
### Changed
|
||||||
- Add `cpp` extension to default `link_regex` of IPD crawler
|
- Add `cpp` extension to default `link_regex` of IPD crawler
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- IPD crawler crashes on some sites
|
||||||
|
|
||||||
## 3.4.0 - 2022-05-01
|
## 3.4.0 - 2022-05-01
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
|
||||||
|
|
@ -161,4 +161,10 @@ class KitIpdCrawler(HttpCrawler):
|
||||||
|
|
||||||
async def get_page(self) -> BeautifulSoup:
|
async def get_page(self) -> BeautifulSoup:
|
||||||
async with self.session.get(self._url) as request:
|
async with self.session.get(self._url) as request:
|
||||||
return soupify(await request.read())
|
# The web page for Algorithmen für Routenplanung contains some
|
||||||
|
# weird comments that beautifulsoup doesn't parse correctly. This
|
||||||
|
# hack enables those pages to be crawled, and should hopefully not
|
||||||
|
# cause issues on other pages.
|
||||||
|
content = (await request.read()).decode("utf-8")
|
||||||
|
content = re.sub(r"<!--.*?-->", "", content)
|
||||||
|
return soupify(content.encode("utf-8"))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue