Extract links from articles

This commit is contained in:
Joscha 2022-09-30 00:39:44 +02:00
parent fe1db32c0e
commit 73064ea2b0
2 changed files with 14 additions and 3 deletions

View file

@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
name = "sift"
version = "0.1.0"
description = "Sift through wikipedia dumps and extract interesting info"
dependencies = ["mwxml >= 0.3.3, < 0.4"]
dependencies = ["mwxml >= 0.3.3, < 0.4", "wikitextparser >= 0.51.0, < 0.52"]
[project.scripts]
sift = "sift:main"

View file

@@ -1,7 +1,9 @@
import json
import sys
from pathlib import Path
import mwxml # https://pythonhosted.org/mwxml/
import wikitextparser as wtp # https://github.com/5j9/wikitextparser#readme
def eprint(*args, **kwargs):
@@ -11,8 +13,17 @@ def eprint(*args, **kwargs):
def process_page(page):
    """Extract all wikilinks from one page and print them as a single JSON line.

    page: an mwxml.Page — an iterator over the page's revisions
    (https://pythonhosted.org/mwxml/iteration.html#mwxml.Page).

    Side effects: writes progress/diagnostics to stderr via eprint, and the
    result record {"id", "title", "links"} to stdout as compact JSON.
    """
    eprint(f"{page.id:8} - {page.title}")
    # BUG FIX: mwxml.Page is a one-shot iterator. The original code called
    # list(page) three separate times; the first call exhausts the stream, so
    # the logged revision count was always wrong and the unpack below would
    # see an empty list. Materialize the revisions exactly once.
    revisions = list(page)
    if len(revisions) != 1:
        eprint(f"{page.id:8} - {page.title} - {len(revisions)}")
    [revision] = revisions  # Every page has exactly one revision
    parsed = wtp.parse(revision.text)
    # link.span is the (start, end) character offsets of the wikilink within
    # the revision's wikitext.
    links = [
        {"to": link.title, "start": link.span[0], "end": link.span[1]}
        for link in parsed.wikilinks
    ]
    info = {"id": page.id, "title": page.title, "links": links}
    # check_circular=False skips the cycle check (plain dicts/lists here);
    # compact separators keep the one-record-per-line output small.
    print(json.dumps(info, check_circular=False, separators=(",", ":")))
def main():