Print nicer sift stats
This commit is contained in:
parent
27416cf782
commit
7af2a4e06f
1 changed file with 8 additions and 3 deletions
11
sift/sift.py
11
sift/sift.py
|
|
@@ -172,16 +172,21 @@ def process_xmldump_page(page):
|
|||
# Page info as simple tuples
|
||||
def simple_pages(input):
|
||||
dump = mwxml.Dump.from_file(sys.stdin)
|
||||
articles = 0
|
||||
for i, page in enumerate(dump.pages):
|
||||
if (i + 1) % 1000 == 0:
|
||||
# Yeah, the articles are usually off by one
|
||||
eprint(f"{i+1:8} pages, {articles:8} articles, at pid {page.id:8}")
|
||||
|
||||
if page.namespace != 0:
|
||||
continue
|
||||
|
||||
if (i + 1) % 1000 == 0:
|
||||
eprint(f"{i+1:8} pages, at pid {page.id:8}")
|
||||
|
||||
articles += 1
|
||||
[revision] = list(page) # Every page has exactly one revision
|
||||
yield page.id, page.title, revision.text or "", page.redirect
|
||||
|
||||
eprint(f"{articles} articles total")
|
||||
|
||||
|
||||
def process_simple_page(info):
|
||||
pid, title, text, redirect = info
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue