Extract links from articles

This commit is contained in:
Joscha 2022-09-30 00:39:44 +02:00
parent fe1db32c0e
commit 73064ea2b0
2 changed files with 14 additions and 3 deletions

View file

@@ -1,7 +1,9 @@
import json
import sys
from pathlib import Path
import mwxml # https://pythonhosted.org/mwxml/
import wikitextparser as wtp # https://github.com/5j9/wikitextparser#readme
def eprint(*args, **kwargs):
@@ -11,8 +13,17 @@ def eprint(*args, **kwargs):
def process_page(page):
    """Print one compact JSON line describing *page* and its wikilinks.

    The emitted object has the keys ``id``, ``title`` and ``links``; each
    entry of ``links`` holds the link target (``to``) and the ``start`` /
    ``end`` of the link's span as reported by wikitextparser.

    Args:
        page: An mwxml page, see
            https://pythonhosted.org/mwxml/iteration.html#mwxml.Page

    Raises:
        ValueError: If the page does not contain exactly one revision.
    """
    eprint(f"{page.id:8} - {page.title}")

    # Materialise the revisions exactly once: a page is consumed while it is
    # iterated, so calling list(page) again would see an exhausted iterator
    # and yield an empty list.
    revisions = list(page)
    if len(revisions) != 1:
        eprint(f"{page.id:8} - {page.title} - {len(revisions)}")

    [revision] = revisions  # Every page is expected to have exactly one revision

    parsed = wtp.parse(revision.text)

    links = []
    for link in parsed.wikilinks:
        start, end = link.span
        links.append({"to": link.title, "start": start, "end": end})

    info = {"id": page.id, "title": page.title, "links": links}
    # Compact separators keep the line-per-page output small.
    print(json.dumps(info, check_circular=False, separators=(",", ":")))
def main():