44 lines
1.1 KiB
Python
44 lines
1.1 KiB
Python
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import mwxml # https://pythonhosted.org/mwxml/
|
|
import wikitextparser as wtp # https://github.com/5j9/wikitextparser#readme
|
|
|
|
|
|
def eprint(*args, **kwargs):
    """Print to standard error; accepts the same arguments as ``print``.

    ``file`` is always supplied by this helper, so passing ``file=`` in
    ``kwargs`` raises ``TypeError``.
    """
    # Look the stream up at call time so later rebinding of sys.stderr is honoured.
    err_stream = sys.stderr
    print(*args, file=err_stream, **kwargs)
|
|
|
|
|
|
def process_page(page):
    """Print a progress line and a compact JSON record for one dump page.

    The record contains the page's namespace, id and title, the length of
    the revision text, and every wikilink found in that text as
    ``[title, start, end]`` (``start``/``end`` are the link's span as
    reported by wikitextparser). For redirect pages the redirect target is
    added under ``"redirect"``.

    Args:
        page: An mwxml page, see
            https://pythonhosted.org/mwxml/iteration.html#mwxml.Page.
            Iterating it must yield exactly one revision.

    Raises:
        ValueError: If the page does not yield exactly one revision.
        AssertionError: If a redirect page does not contain exactly one
            wikilink, or that link's title differs from ``page.redirect``.
    """
    # https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
    print(f"{page.namespace:4}: {page.id:8} - {page.title}")

    [revision] = page  # Every page has exactly one revision
    text = revision.text or ""

    # One (title, start, end) entry per wikilink; json.dumps emits each
    # tuple as a JSON array.
    links = [(link.title, *link.span) for link in wtp.parse(text).wikilinks]

    info = {
        "ns": page.namespace,
        "id": page.id,
        "title": page.title,
        "length": len(text),
        "links": links,
    }

    if page.redirect:
        # Sanity checks: a redirect page is expected to consist of a single
        # link pointing at the redirect target.
        assert len(links) == 1, (
            f"redirect page {page.title!r} has {len(links)} links, expected 1"
        )
        # Entries are (title, start, end) tuples, so compare only the title;
        # comparing the whole tuple with page.redirect could never succeed.
        # NOTE(review): assumes page.redirect is the target title string and
        # that it matches the link title verbatim - confirm against the dump.
        link_title = links[0][0]
        assert link_title == page.redirect, (
            f"redirect page {page.title!r} links to {link_title!r}, "
            f"expected {page.redirect!r}"
        )
        info["redirect"] = page.redirect

    print(json.dumps(info, check_circular=False, separators=(",", ":")))
|
|
|
|
|
|
def main():
    """Read a MediaWiki XML dump from standard input and process every page.

    Each page of the dump is handed to ``process_page`` in dump order.
    """
    dump = mwxml.Dump.from_file(sys.stdin)
    for page in dump.pages:
        process_page(page)


# Without this guard nothing ever calls main(), so running the file as a
# script would silently do nothing. Importing the module is unaffected.
if __name__ == "__main__":
    main()
|