Handle redirects
This commit is contained in:
parent
23c7df3c43
commit
1db581725b
1 changed file with 16 additions and 7 deletions
17
sift/sift.py
17
sift/sift.py
|
|
@@ -14,15 +14,24 @@ def process_page(page):
|
||||||
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
|
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
|
||||||
eprint(f"{page.id:8} - {page.title}")
|
eprint(f"{page.id:8} - {page.title}")
|
||||||
|
|
||||||
|
info = {"id": page.id, "title": page.title}
|
||||||
|
|
||||||
|
if page.redirect:
|
||||||
|
info["redirect"] = page.redirect
|
||||||
|
else:
|
||||||
[revision] = list(page) # Every page has exactly one revision
|
[revision] = list(page) # Every page has exactly one revision
|
||||||
parsed = wtp.parse(revision.text)
|
|
||||||
|
|
||||||
|
length = len(revision.text)
|
||||||
|
info["length"] = length
|
||||||
|
|
||||||
|
# Parsing may fail for articles with length 0
|
||||||
|
if length > 0:
|
||||||
links = []
|
links = []
|
||||||
for link in parsed.wikilinks:
|
for link in wtp.parse(revision.text).wikilinks:
|
||||||
start, end = link.span
|
start, end = link.span
|
||||||
links.append({"to": link.title, "start": start, "end": end})
|
links.append((link.title, start, end))
|
||||||
|
info["links"] = links
|
||||||
|
|
||||||
info = {"id": page.id, "title": page.title, "links": links}
|
|
||||||
print(json.dumps(info, check_circular=False, separators=(",", ":")))
|
print(json.dumps(info, check_circular=False, separators=(",", ":")))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue