Make JSON format more consistent
This commit is contained in:
parent
51096c99e1
commit
ecdeb4086a
1 changed file with 18 additions and 12 deletions
30
sift/sift.py
30
sift/sift.py
|
|
@@ -12,22 +12,28 @@ def eprint(*args, **kwargs):
|
|||
|
||||
def process_page(page):
|
||||
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
|
||||
eprint(f"{page.namespace:4}: {page.id:8} - {page.title}")
|
||||
print(f"{page.namespace:4}: {page.id:8} - {page.title}")
|
||||
|
||||
info = {"ns": page.namespace, "id": page.id, "title": page.title}
|
||||
[revision] = list(page) # Every page has exactly one revision
|
||||
text = revision.text or ""
|
||||
|
||||
links = []
|
||||
for link in wtp.parse(text).wikilinks:
|
||||
start, end = link.span
|
||||
links.append((link.title, start, end))
|
||||
|
||||
info = {
|
||||
"ns": page.namespace,
|
||||
"id": page.id,
|
||||
"title": page.title,
|
||||
"length": len(text),
|
||||
"links": links,
|
||||
}
|
||||
|
||||
if page.redirect:
|
||||
assert len(links) == 1
|
||||
assert links[0] == page.redirect
|
||||
info["redirect"] = page.redirect
|
||||
else:
|
||||
[revision] = list(page) # Every page has exactly one revision
|
||||
if revision.text is not None:
|
||||
info["length"] = len(revision.text)
|
||||
|
||||
links = []
|
||||
for link in wtp.parse(revision.text).wikilinks:
|
||||
start, end = link.span
|
||||
links.append((link.title, start, end))
|
||||
info["links"] = links
|
||||
|
||||
print(json.dumps(info, check_circular=False, separators=(",", ":")))
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue