From 73064ea2b0ee78a4d9c8313973fb19d3d266a34f Mon Sep 17 00:00:00 2001
From: Joscha
Date: Fri, 30 Sep 2022 00:39:44 +0200
Subject: [PATCH] Extract links from articles

---
 sift/pyproject.toml |  2 +-
 sift/sift.py        | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/sift/pyproject.toml b/sift/pyproject.toml
index 0e1da80..698e739 100644
--- a/sift/pyproject.toml
+++ b/sift/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 name = "sift"
 version = "0.1.0"
 description = "Sift through wikipedia dumps and extract interesting info"
-dependencies = ["mwxml >= 0.3.3, < 0.4"]
+dependencies = ["mwxml >= 0.3.3, < 0.4", "wikitextparser >= 0.51.0, < 0.52"]
 
 [project.scripts]
 sift = "sift:main"
diff --git a/sift/sift.py b/sift/sift.py
index ab75a81..27901a6 100644
--- a/sift/sift.py
+++ b/sift/sift.py
@@ -1,7 +1,9 @@
+import json
 import sys
 from pathlib import Path
 
 import mwxml  # https://pythonhosted.org/mwxml/
+import wikitextparser as wtp  # https://github.com/5j9/wikitextparser#readme
 
 
 def eprint(*args, **kwargs):
@@ -11,8 +13,17 @@ def eprint(*args, **kwargs):
 def process_page(page):
     # https://pythonhosted.org/mwxml/iteration.html#mwxml.Page
     eprint(f"{page.id:8} - {page.title}")
-    if len(list(page)) != 1:
-        eprint(f"{page.id:8} - {page.title} - {len(list(page))}")
+
+    [revision] = list(page)  # Every page has exactly one revision
+    parsed = wtp.parse(revision.text)
+
+    links = []
+    for link in parsed.wikilinks:
+        start, end = link.span
+        links.append({"to": link.title, "start": start, "end": end})
+
+    info = {"id": page.id, "title": page.title, "links": links}
+    print(json.dumps(info, check_circular=False, separators=(",", ":")))
 
 
 def main():