Ignore all namespaces except 0
This commit is contained in:
parent
ecdeb4086a
commit
78a5aa5169
1 changed file with 5 additions and 6 deletions
11
sift/sift.py
11
sift/sift.py
|
|
@@ -10,9 +10,11 @@ def eprint(*args, **kwargs):
|
||||||
print(*args, file=sys.stderr, **kwargs)
|
print(*args, file=sys.stderr, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def process_page(page):
|
|
||||||
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
|
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
|
||||||
print(f"{page.namespace:4}: {page.id:8} - {page.title}")
|
def process_page(page):
|
||||||
|
if page.namespace != 0:
|
||||||
|
return
|
||||||
|
eprint(f"{page.id:8} - {page.title}")
|
||||||
|
|
||||||
[revision] = list(page) # Every page has exactly one revision
|
[revision] = list(page) # Every page has exactly one revision
|
||||||
text = revision.text or ""
|
text = revision.text or ""
|
||||||
|
|
@@ -20,10 +22,9 @@ def process_page(page):
|
||||||
links = []
|
links = []
|
||||||
for link in wtp.parse(text).wikilinks:
|
for link in wtp.parse(text).wikilinks:
|
||||||
start, end = link.span
|
start, end = link.span
|
||||||
links.append((link.title, start, end))
|
links.append((link.title.strip(), start, end))
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
"ns": page.namespace,
|
|
||||||
"id": page.id,
|
"id": page.id,
|
||||||
"title": page.title,
|
"title": page.title,
|
||||||
"length": len(text),
|
"length": len(text),
|
||||||
|
|
@@ -31,8 +32,6 @@ def process_page(page):
|
||||||
}
|
}
|
||||||
|
|
||||||
if page.redirect:
|
if page.redirect:
|
||||||
assert len(links) == 1
|
|
||||||
assert links[0] == page.redirect
|
|
||||||
info["redirect"] = page.redirect
|
info["redirect"] = page.redirect
|
||||||
|
|
||||||
print(json.dumps(info, check_circular=False, separators=(",", ":")))
|
print(json.dumps(info, check_circular=False, separators=(",", ":")))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue