Handle redirects

This commit is contained in:
Joscha 2022-09-30 01:18:51 +02:00
parent 23c7df3c43
commit 1db581725b

View file

@@ -14,15 +14,24 @@ def process_page(page):
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page # https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
eprint(f"{page.id:8} - {page.title}") eprint(f"{page.id:8} - {page.title}")
info = {"id": page.id, "title": page.title}
if page.redirect:
info["redirect"] = page.redirect
else:
[revision] = list(page) # Every page has exactly one revision [revision] = list(page) # Every page has exactly one revision
parsed = wtp.parse(revision.text)
length = len(revision.text)
info["length"] = length
# Parsing may fail for articles with length 0
if length > 0:
links = [] links = []
for link in parsed.wikilinks: for link in wtp.parse(revision.text).wikilinks:
start, end = link.span start, end = link.span
links.append({"to": link.title, "start": start, "end": end}) links.append((link.title, start, end))
info["links"] = links
info = {"id": page.id, "title": page.title, "links": links}
print(json.dumps(info, check_circular=False, separators=(",", ":"))) print(json.dumps(info, check_circular=False, separators=(",", ":")))