From 7af2a4e06f00c6b881a90db7a6f0a60ab2838550 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Sun, 29 Dec 2024 20:48:52 +0100
Subject: [PATCH] Print nicer sift stats

---
 sift/sift.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/sift/sift.py b/sift/sift.py
index bde2e74..2562fa2 100644
--- a/sift/sift.py
+++ b/sift/sift.py
@@ -172,16 +172,21 @@ def process_xmldump_page(page):
 # Page info as simple tuples
 def simple_pages(input):
     dump = mwxml.Dump.from_file(sys.stdin)
+    articles = 0
     for i, page in enumerate(dump.pages):
+        if (i + 1) % 1000 == 0:
+            # Yeah, the articles are usually off by one
+            eprint(f"{i+1:8} pages, {articles:8} articles, at pid {page.id:8}")
+
         if page.namespace != 0:
             continue
 
-        if (i + 1) % 1000 == 0:
-            eprint(f"{i+1:8} pages, at pid {page.id:8}")
-
+        articles += 1
         [revision] = list(page) # Every page has exactly one revision
         yield page.id, page.title, revision.text or "", page.redirect
 
+    eprint(f"{articles} articles total")
+
 
 def process_simple_page(info):
     pid, title, text, redirect = info