Print nicer sift stats
This commit is contained in:
parent
27416cf782
commit
7af2a4e06f
1 changed file with 8 additions and 3 deletions
11
sift/sift.py
11
sift/sift.py
|
|
@@ -172,16 +172,21 @@ def process_xmldump_page(page):
|
||||||
# Page info as simple tuples
|
# Page info as simple tuples
|
||||||
def simple_pages(input):
|
def simple_pages(input):
|
||||||
dump = mwxml.Dump.from_file(sys.stdin)
|
dump = mwxml.Dump.from_file(sys.stdin)
|
||||||
|
articles = 0
|
||||||
for i, page in enumerate(dump.pages):
|
for i, page in enumerate(dump.pages):
|
||||||
|
if (i + 1) % 1000 == 0:
|
||||||
|
# Yeah, the articles are usually off by one
|
||||||
|
eprint(f"{i+1:8} pages, {articles:8} articles, at pid {page.id:8}")
|
||||||
|
|
||||||
if page.namespace != 0:
|
if page.namespace != 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (i + 1) % 1000 == 0:
|
articles += 1
|
||||||
eprint(f"{i+1:8} pages, at pid {page.id:8}")
|
|
||||||
|
|
||||||
[revision] = list(page) # Every page has exactly one revision
|
[revision] = list(page) # Every page has exactly one revision
|
||||||
yield page.id, page.title, revision.text or "", page.redirect
|
yield page.id, page.title, revision.text or "", page.redirect
|
||||||
|
|
||||||
|
eprint(f"{articles} articles total")
|
||||||
|
|
||||||
|
|
||||||
def process_simple_page(info):
|
def process_simple_page(info):
|
||||||
pid, title, text, redirect = info
|
pid, title, text, redirect = info
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue