import json import re import sys import mwxml # https://pythonhosted.org/mwxml/ import wikitextparser as wtp # https://github.com/5j9/wikitextparser#readme # A link can have two important properties: # 1. It can be inside a bigger structure (e.g. an infobox template) # 2. It can be inside parentheses # # The first link that is neither in parentheses nor part of any template is # considered the first link of the article for the purposes of the # Philosophy Game. # # The parentheses "(" and ")" are only recognized outside of certain # components like templates. # # https://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy # https://diff.wikimedia.org/2018/04/20/why-it-took-a-long-time-to-build-that-tiny-link-preview-on-wikipedia/ # https://www.mediawiki.org/wiki/Page_Previews/API_Specification def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) def advance_delims(delims, to): delta = 0 while delims: i, opening = delims[-1] if i <= to: delims.pop() delta += 1 if opening else -1 else: break return delta def find_structures(page): # These elements count as "structures". Within them, parentheses are ignored # and links count as "in a structure". structures = [] structures.extend(i.span for i in page.comments) structures.extend(i.span for i in page.external_links) structures.extend(i.span for i in page.get_tags()) # Usually structures.extend(i.span for i in page.tables) structures.extend(i.span for i in page.templates) structure_delims = [] structure_delims.extend((s, True) for s, _ in structures) structure_delims.extend((e, False) for _, e in structures) structure_delims.sort() return structure_delims def find_parens(page, structure_delims): structure_delims = list(reversed(sorted(structure_delims))) open_structures = 0 paren_delims = [] for m in re.finditer(r"\(|\)", page.string): start, end = m.span() open_structures += advance_delims(structure_delims, start) if open_structures != 0: continue opening = m.group(0) == "(" pos = start if opening else end paren_delims.append((pos, opening)) return paren_delims def fix_parens(paren_delims): # First, remove closing parens that close nonexistent opening parens. open_parens = 0 paren_delims_2 = [] for i, opening in paren_delims: if opening: open_parens += 1 paren_delims_2.append((i, opening)) elif open_parens > 0: open_parens -= 1 paren_delims_2.append((i, opening)) else: eprint(f"(removed weird closing paren at {i})") # Then, remove opening parens that would never be closed. open_parens = 0 paren_delims_3 = [] for i, opening in reversed(paren_delims_2): if not opening: open_parens += 1 paren_delims_3.append((i, opening)) elif open_parens > 0: open_parens -= 1 paren_delims_3.append((i, opening)) else: eprint(f"(removed weird opening paren at {i})") paren_delims_3.reverse() return paren_delims_3 def format_link(link, in_structure, in_parens): title = link.title.strip() start, end = link.span flags = in_structure << 1 | in_parens return (title, start, end - start, flags) def find_links(page, structure_delims, paren_delims): structure_delims = list(reversed(sorted(structure_delims))) paren_delims = list(reversed(sorted(paren_delims))) open_structures = 0 open_parens = 0 links = [] for link in page.wikilinks: start, end = link.span open_structures += advance_delims(structure_delims, start) open_parens += advance_delims(paren_delims, start) in_structure = open_structures > 0 in_parens = open_parens > 0 links.append(format_link(link, in_structure, in_parens)) return links def pair_parens(paren_delims): starts = [] spans = [] for i, opening in paren_delims: if opening: starts.append(i) else: spans.append((starts.pop(), i)) return spans def process_xmldump_page(page): # https://pythonhosted.org/mwxml/iteration.html#mwxml.Page if page.namespace != 0: return eprint(f"{page.id:8} - {page.title!r}") [revision] = list(page) # Every page has exactly one revision text = revision.text or "" parsed = wtp.parse(text) structure_delims = find_structures(parsed) paren_delims = find_parens(parsed, structure_delims) paren_delims_fixed = fix_parens(paren_delims) links = find_links(parsed, structure_delims, paren_delims_fixed) info = { "id": page.id, "title": page.title, "length": len(text), "links": links, } if page.redirect: info["redirect"] = page.redirect print(json.dumps(info, check_circular=False, separators=(",", ":"))) def main(): dump = mwxml.Dump.from_file(sys.stdin) for page in dump.pages: process_xmldump_page(page)