Extract links from articles
This commit is contained in:
parent
fe1db32c0e
commit
73064ea2b0
2 changed files with 14 additions and 3 deletions
|
|
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
|
||||||
name = "sift"
|
name = "sift"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = "Sift through wikipedia dumps and extract interesting info"
|
description = "Sift through wikipedia dumps and extract interesting info"
|
||||||
dependencies = ["mwxml >= 0.3.3, < 0.4"]
|
dependencies = ["mwxml >= 0.3.3, < 0.4", "wikitextparser >= 0.51.0, < 0.52"]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
sift = "sift:main"
|
sift = "sift:main"
|
||||||
|
|
|
||||||
15
sift/sift.py
15
sift/sift.py
|
|
@@ -1,7 +1,9 @@
|
||||||
|
import json
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import mwxml # https://pythonhosted.org/mwxml/
|
import mwxml # https://pythonhosted.org/mwxml/
|
||||||
|
import wikitextparser as wtp # https://github.com/5j9/wikitextparser#readme
|
||||||
|
|
||||||
|
|
||||||
def eprint(*args, **kwargs):
|
def eprint(*args, **kwargs):
|
||||||
|
|
@@ -11,8 +13,17 @@ def eprint(*args, **kwargs):
|
||||||
def process_page(page):
|
def process_page(page):
|
||||||
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
|
# https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
|
||||||
eprint(f"{page.id:8} - {page.title}")
|
eprint(f"{page.id:8} - {page.title}")
|
||||||
if len(list(page)) != 1:
|
|
||||||
eprint(f"{page.id:8} - {page.title} - {len(list(page))}")
|
[revision] = list(page) # Every page has exactly one revision
|
||||||
|
parsed = wtp.parse(revision.text)
|
||||||
|
|
||||||
|
links = []
|
||||||
|
for link in parsed.wikilinks:
|
||||||
|
start, end = link.span
|
||||||
|
links.append({"to": link.title, "start": start, "end": end})
|
||||||
|
|
||||||
|
info = {"id": page.id, "title": page.title, "links": links}
|
||||||
|
print(json.dumps(info, check_circular=False, separators=(",", ":")))
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue