Ignore all namespaces except 0

This commit is contained in:
Joscha 2022-10-03 16:26:08 +02:00
parent ecdeb4086a
commit 78a5aa5169

View file

@@ -10,9 +10,11 @@ def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs) print(*args, file=sys.stderr, **kwargs)
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
def process_page(page): def process_page(page):
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page if page.namespace != 0:
print(f"{page.namespace:4}: {page.id:8} - {page.title}") return
eprint(f"{page.id:8} - {page.title}")
[revision] = list(page) # Every page has exactly one revision [revision] = list(page) # Every page has exactly one revision
text = revision.text or "" text = revision.text or ""
@@ -20,10 +22,9 @@ def process_page(page):
links = [] links = []
for link in wtp.parse(text).wikilinks: for link in wtp.parse(text).wikilinks:
start, end = link.span start, end = link.span
links.append((link.title, start, end)) links.append((link.title.strip(), start, end))
info = { info = {
"ns": page.namespace,
"id": page.id, "id": page.id,
"title": page.title, "title": page.title,
"length": len(text), "length": len(text),
@@ -31,8 +32,6 @@ def process_page(page):
} }
if page.redirect: if page.redirect:
assert len(links) == 1
assert links[0] == page.redirect
info["redirect"] = page.redirect info["redirect"] = page.redirect
print(json.dumps(info, check_circular=False, separators=(",", ":"))) print(json.dumps(info, check_circular=False, separators=(",", ":")))