From 78a5aa51699a22f31383efffeda8ee36de6e6a2d Mon Sep 17 00:00:00 2001
From: Joscha
Date: Mon, 3 Oct 2022 16:26:08 +0200
Subject: [PATCH] Ignore all namespaces except 0

---
 sift/sift.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/sift/sift.py b/sift/sift.py
index 3538983..d6c8844 100644
--- a/sift/sift.py
+++ b/sift/sift.py
@@ -10,9 +10,11 @@ def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
 
 
+# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
 def process_page(page):
-    # https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
-    print(f"{page.namespace:4}: {page.id:8} - {page.title}")
+    if page.namespace != 0:
+        return
+    eprint(f"{page.id:8} - {page.title}")
 
     [revision] = list(page)  # Every page has exactly one revision
     text = revision.text or ""
@@ -20,10 +22,9 @@ def process_page(page):
     links = []
     for link in wtp.parse(text).wikilinks:
         start, end = link.span
-        links.append((link.title, start, end))
+        links.append((link.title.strip(), start, end))
 
     info = {
-        "ns": page.namespace,
         "id": page.id,
         "title": page.title,
         "length": len(text),
@@ -31,8 +32,6 @@ def process_page(page):
     }
 
     if page.redirect:
-        assert len(links) == 1
-        assert links[0] == page.redirect
         info["redirect"] = page.redirect
 
     print(json.dumps(info, check_circular=False, separators=(",", ":")))