Rewrite ingest command
This commit is contained in:
parent
f819f5bf69
commit
3aa8222b6b
8 changed files with 502 additions and 242 deletions
46
brood/Cargo.lock
generated
46
brood/Cargo.lock
generated
|
|
@ -2,6 +2,15 @@
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 4
|
version = 4
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anstream"
|
name = "anstream"
|
||||||
version = "0.6.18"
|
version = "0.6.18"
|
||||||
|
|
@ -56,9 +65,11 @@ name = "brood"
|
||||||
version = "0.0.0"
|
version = "0.0.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
|
"regex",
|
||||||
"rustc-hash",
|
"rustc-hash",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"thousands",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -149,6 +160,35 @@ dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustc-hash"
|
name = "rustc-hash"
|
||||||
version = "2.1.0"
|
version = "2.1.0"
|
||||||
|
|
@ -210,6 +250,12 @@ dependencies = [
|
||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thousands"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-ident"
|
name = "unicode-ident"
|
||||||
version = "1.0.14"
|
version = "1.0.14"
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,8 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { version = "4.5.23", features = ["derive", "deprecated"] }
|
clap = { version = "4.5.23", features = ["derive", "deprecated"] }
|
||||||
|
regex = "1.11.1"
|
||||||
rustc-hash = "2.1.0"
|
rustc-hash = "2.1.0"
|
||||||
serde = { version = "1.0.217", features = ["derive"] }
|
serde = { version = "1.0.217", features = ["derive"] }
|
||||||
serde_json = "1.0.134"
|
serde_json = "1.0.134"
|
||||||
|
thousands = "0.2.0"
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1 @@
|
||||||
pub mod ingest;
|
pub mod ingest;
|
||||||
pub mod list_links;
|
|
||||||
pub mod list_pages;
|
|
||||||
pub mod longest_shortest_path;
|
|
||||||
pub mod path;
|
|
||||||
pub mod philosophy_game;
|
|
||||||
pub mod reexport;
|
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,18 @@
|
||||||
use std::collections::hash_map::Entry;
|
use std::{
|
||||||
use std::fs::File;
|
collections::{hash_map::Entry, HashMap},
|
||||||
use std::io::{self, BufRead, BufReader, BufWriter};
|
fs::File,
|
||||||
use std::path::Path;
|
io::{self, BufRead, BufReader, Seek},
|
||||||
use std::u32;
|
path::{Path, PathBuf},
|
||||||
|
};
|
||||||
|
|
||||||
use rustc_hash::FxHashMap;
|
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
|
use thousands::Separable;
|
||||||
|
|
||||||
use crate::data::adjacency_list::{AdjacencyList, Page};
|
use crate::{
|
||||||
use crate::data::info::{LinkInfo, PageInfo};
|
data::{self, Link, Page},
|
||||||
use crate::data::store;
|
graph::{Graph, NodeIdx},
|
||||||
use crate::util;
|
util::{Counter, TitleNormalizer},
|
||||||
|
};
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
#[derive(Deserialize)]
|
||||||
struct JsonPage {
|
struct JsonPage {
|
||||||
|
|
@ -21,151 +23,139 @@ struct JsonPage {
|
||||||
redirect: Option<String>,
|
redirect: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
fn read_titles(r: &mut BufReader<File>) -> io::Result<Vec<String>> {
|
||||||
Importing is a tad complicated because of multiple criteria:
|
let mut counter = Counter::new();
|
||||||
|
let mut titles = vec![];
|
||||||
|
|
||||||
1. The data must be read in a single pass on stdin
|
for line in r.lines() {
|
||||||
2. The process should not consume a lot of memory
|
counter.tick();
|
||||||
(can't store the decoded json data directly)
|
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
||||||
3. The process should result in a nice and compact adjacency list format
|
titles.push(page.title);
|
||||||
|
|
||||||
Because of this, the import is a bit more complex and has two passes.
|
|
||||||
|
|
||||||
The first pass imports the data into an adjacency-list-like format, but the
|
|
||||||
`Link::to` field points to a title in `Titles` instead of a page.
|
|
||||||
|
|
||||||
The second pass then resolves the links to page indices and throws away all
|
|
||||||
links that don't point to any known page.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
struct Titles {
|
|
||||||
/// Normalized titles
|
|
||||||
titles: Vec<String>,
|
|
||||||
/// Map from normalized title to index in [`Self::titles`].
|
|
||||||
map: FxHashMap<String, u32>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Titles {
|
|
||||||
fn insert(&mut self, title: String) -> u32 {
|
|
||||||
match self.map.entry(title.clone()) {
|
|
||||||
Entry::Occupied(occupied) => *occupied.get(),
|
|
||||||
Entry::Vacant(vacant) => {
|
|
||||||
let idx = self.titles.len() as u32;
|
|
||||||
self.titles.push(title);
|
|
||||||
vacant.insert(idx);
|
|
||||||
idx
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get(&self, i: u32) -> &str {
|
counter.done();
|
||||||
&self.titles[i as usize]
|
Ok(titles)
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
|
fn compute_title_lookup(normalizer: &TitleNormalizer, titles: &[String]) -> HashMap<String, u32> {
|
||||||
let mut titles = Titles::default();
|
let mut counter = Counter::new();
|
||||||
let mut result = AdjacencyList::default();
|
let mut title_lookup = HashMap::new();
|
||||||
|
|
||||||
let stdin = BufReader::new(io::stdin());
|
for (i, title) in titles.iter().enumerate() {
|
||||||
for (i, line) in stdin.lines().enumerate() {
|
counter.tick();
|
||||||
let json_page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
match title_lookup.entry(normalizer.normalize(title)) {
|
||||||
|
Entry::Occupied(mut entry) => {
|
||||||
result.push_page(PageInfo {
|
let prev_i = *entry.get();
|
||||||
id: json_page.id,
|
let prev = &titles[prev_i as usize];
|
||||||
length: json_page.length,
|
if prev == title {
|
||||||
redirect: json_page.redirect.is_some(),
|
println!(" {title:?} ({prev_i}) occurs again at {i}");
|
||||||
title: json_page.title,
|
// Prefer later occurrences of articles over earlier ones under
|
||||||
});
|
// the assumption that their contents are "fresher".
|
||||||
|
entry.insert(i as u32);
|
||||||
if let Some(to) = json_page.redirect {
|
} else {
|
||||||
let to = titles.insert(util::normalize_link(&to));
|
println!(
|
||||||
result.push_link(to, LinkInfo::default());
|
" {prev:?} ({prev_i}) and {title:?} ({i}) both normalize to {:?}",
|
||||||
} else {
|
normalizer.normalize(title)
|
||||||
for (to, start, len, flags) in json_page.links {
|
);
|
||||||
let to = titles.insert(util::normalize_link(&to));
|
}
|
||||||
result.push_link(to, LinkInfo { start, len, flags });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i + 1) % 100_000 == 0 {
|
|
||||||
eprintln!("{} pages imported", i + 1)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
eprintln!("Pages: {}", result.pages.len());
|
|
||||||
eprintln!("Links: {}", result.links.len());
|
|
||||||
eprintln!("Titles: {}", titles.titles.len());
|
|
||||||
eprintln!("Title map entries: {}", titles.map.len());
|
|
||||||
|
|
||||||
Ok((result, titles))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create map from normalized title to index in pages.
|
|
||||||
fn initialize_pages_map(pages: &[Page<PageInfo>]) -> FxHashMap<String, u32> {
|
|
||||||
let mut result = FxHashMap::default();
|
|
||||||
for (i, p) in pages.iter().enumerate() {
|
|
||||||
match result.entry(util::normalize_link(&p.data.title)) {
|
|
||||||
Entry::Occupied(entry) => {
|
|
||||||
eprintln!(
|
|
||||||
"{:?} already exists at index {} as {:?}",
|
|
||||||
p.data.title,
|
|
||||||
entry.get(),
|
|
||||||
util::normalize_link(&p.data.title)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
Entry::Vacant(entry) => {
|
Entry::Vacant(entry) => {
|
||||||
entry.insert(i as u32);
|
entry.insert(i as u32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result
|
|
||||||
|
counter.done();
|
||||||
|
title_lookup
|
||||||
}
|
}
|
||||||
|
|
||||||
fn second_stage(
|
fn read_page_data(
|
||||||
first_stage: &AdjacencyList<PageInfo, LinkInfo>,
|
normalizer: &TitleNormalizer,
|
||||||
titles: &Titles,
|
title_lookup: &HashMap<String, u32>,
|
||||||
) -> AdjacencyList<PageInfo, LinkInfo> {
|
r: &mut BufReader<File>,
|
||||||
let pages_map = initialize_pages_map(&first_stage.pages);
|
) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
|
||||||
let mut result = AdjacencyList::default();
|
let mut counter = Counter::new();
|
||||||
|
let mut pages = vec![];
|
||||||
|
let mut links = vec![];
|
||||||
|
let mut graph = Graph::new();
|
||||||
|
|
||||||
for (page_idx, page) in first_stage.pages() {
|
for (i, line) in r.lines().enumerate() {
|
||||||
result.push_page(page.data.clone());
|
counter.tick();
|
||||||
|
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
||||||
|
let normalized = normalizer.normalize(&page.title);
|
||||||
|
|
||||||
for (_, link) in first_stage.links(page_idx) {
|
let expected_i = title_lookup[&normalized];
|
||||||
let title = util::normalize_link(titles.get(link.to));
|
if i as u32 != expected_i {
|
||||||
if let Some(to) = pages_map.get(&title) {
|
// Articles may occur multiple times, and this is not the instance
|
||||||
// The link points to an existing article, we should keep it
|
// of the article we should keep.
|
||||||
result.push_link(*to, link.data);
|
println!(" Skipping {:?} ({i}) in favor of {expected_i}", page.title);
|
||||||
}
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (page_idx + 1) % 100_000 == 0 {
|
graph.add_node();
|
||||||
eprintln!("{} pages imported", page_idx + 1)
|
pages.push(Page {
|
||||||
|
id: page.id,
|
||||||
|
title: page.title,
|
||||||
|
length: page.length,
|
||||||
|
redirect: page.redirect.is_some(),
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut page_links = page.links;
|
||||||
|
if let Some(target) = page.redirect {
|
||||||
|
page_links.clear();
|
||||||
|
let len = target.len() as u32;
|
||||||
|
page_links.push((target, 0, len, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (target, start, len, flags) in page_links {
|
||||||
|
if let Some(target_i) = title_lookup.get(&normalizer.normalize(&target)) {
|
||||||
|
graph.edges.push(NodeIdx(*target_i));
|
||||||
|
links.push(Link { start, len, flags });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
eprintln!("Pages: {}", result.pages.len());
|
counter.done();
|
||||||
eprintln!("Links: {}", result.links.len());
|
Ok((pages, links, graph))
|
||||||
eprintln!("Page map entries: {}", pages_map.len());
|
|
||||||
|
|
||||||
result
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn ingest(datafile: &Path) -> io::Result<()> {
|
/// Convert sift data to brood data.
|
||||||
eprintln!(">> First stage");
|
#[derive(Debug, clap::Parser)]
|
||||||
let (first_stage, titles) = first_stage()?;
|
pub struct Cmd {
|
||||||
|
/// The sift data file to ingest.
|
||||||
eprintln!(">> Second stage");
|
data: PathBuf,
|
||||||
let data = second_stage(&first_stage, &titles);
|
}
|
||||||
|
|
||||||
eprintln!(">> Consistency check");
|
impl Cmd {
|
||||||
data.check_consistency();
|
pub fn run(self, data: &Path) -> io::Result<()> {
|
||||||
|
let normalizer = TitleNormalizer::new();
|
||||||
eprintln!(">> Export");
|
|
||||||
let mut datafile = BufWriter::new(File::create(datafile)?);
|
println!(">> First pass");
|
||||||
store::write_adjacency_list(&data, &mut datafile)?;
|
let mut sift_data = BufReader::new(File::open(&self.data)?);
|
||||||
|
|
||||||
Ok(())
|
println!("> Reading titles");
|
||||||
|
let titles = read_titles(&mut sift_data)?;
|
||||||
|
|
||||||
|
println!("> Computing title index lookup table");
|
||||||
|
let title_lookup = compute_title_lookup(&normalizer, &titles);
|
||||||
|
drop(titles); // Don't hoard memory
|
||||||
|
|
||||||
|
println!(">> Second pass");
|
||||||
|
sift_data.seek(io::SeekFrom::Start(0))?;
|
||||||
|
|
||||||
|
println!("> Reading page data");
|
||||||
|
let (pages, links, graph) = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
|
||||||
|
drop(title_lookup); // Don't hoard memory
|
||||||
|
drop(sift_data); // No longer needed
|
||||||
|
|
||||||
|
println!("> Checking consistency");
|
||||||
|
graph.check_consistency();
|
||||||
|
|
||||||
|
println!(">> Export");
|
||||||
|
println!("Pages: {}", pages.len().separate_with_underscores());
|
||||||
|
println!("Links: {}", links.len().separate_with_underscores());
|
||||||
|
data::write_to_file(data, &pages, &links, &graph)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,177 @@
|
||||||
pub mod adjacency_list;
|
use std::{
|
||||||
pub mod info;
|
fs::File,
|
||||||
pub mod store;
|
io::{self, BufReader, BufWriter, Read, Write},
|
||||||
|
path::Path,
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::graph::{EdgeIdx, Graph, NodeIdx};
|
||||||
|
|
||||||
|
/// Metadata stored for a single page.
///
/// Serialized by `write_page` and deserialized by `read_page`.
#[derive(Debug, Clone)]
pub struct Page {
    /// Numeric page id.
    pub id: u32,
    /// Page title. It is written with a `u16` length prefix, so it must not
    /// exceed `u16::MAX` bytes when serialized.
    pub title: String,
    /// Page length. NOTE(review): the unit (bytes vs. characters) is not
    /// visible here; confirm against the producer of the input data.
    pub length: u32,
    /// Whether the page is a redirect.
    pub redirect: bool,
}
|
||||||
|
|
||||||
|
/// Metadata stored for a single link.
///
/// Serialized by `write_link` and deserialized by `read_link`.
#[derive(Debug, Default, Clone, Copy)]
pub struct Link {
    /// Start of the link. NOTE(review): presumably an offset into the source
    /// page's text; confirm against the producer of the input data.
    pub start: u32,
    /// Length of the link, in the same unit as `start`.
    pub len: u32,
    /// Bit set; see [`Link::in_parens`] and [`Link::in_structure`].
    pub flags: u8,
}

impl Link {
    /// Bit of `flags` reported by [`Link::in_parens`].
    const IN_PARENS: u8 = 0b01;
    /// Bit of `flags` reported by [`Link::in_structure`].
    const IN_STRUCTURE: u8 = 0b10;

    /// Returns `true` if the "in parens" bit (bit 0) of `flags` is set.
    pub fn in_parens(self) -> bool {
        self.flags & Self::IN_PARENS != 0
    }

    /// Returns `true` if the "in structure" bit (bit 1) of `flags` is set.
    pub fn in_structure(self) -> bool {
        self.flags & Self::IN_STRUCTURE != 0
    }
}
|
||||||
|
|
||||||
|
// Newtype around a mutable borrow of some `W`.
// NOTE(review): nothing in the visible part of this file references `Store`;
// confirm whether it is still needed.
struct Store<'a, W>(&'a mut W);
|
||||||
|
|
||||||
|
/// Write a single byte.
fn write_u8(w: &mut impl Write, n: u8) -> io::Result<()> {
    w.write_all(&n.to_le_bytes())
}

/// Read a single byte.
fn read_u8(r: &mut impl Read) -> io::Result<u8> {
    let mut buf = [0_u8; 1];
    r.read_exact(&mut buf)?;
    Ok(u8::from_le_bytes(buf))
}

/// Write a `u16` in little-endian byte order.
fn write_u16(w: &mut impl Write, n: u16) -> io::Result<()> {
    w.write_all(&n.to_le_bytes())
}

/// Read a little-endian `u16`.
fn read_u16(r: &mut impl Read) -> io::Result<u16> {
    let mut buf = [0_u8; 2];
    r.read_exact(&mut buf)?;
    Ok(u16::from_le_bytes(buf))
}

/// Write a `u32` in little-endian byte order.
fn write_u32(w: &mut impl Write, n: u32) -> io::Result<()> {
    w.write_all(&n.to_le_bytes())
}

/// Read a little-endian `u32`.
fn read_u32(r: &mut impl Read) -> io::Result<u32> {
    let mut buf = [0_u8; 4];
    r.read_exact(&mut buf)?;
    Ok(u32::from_le_bytes(buf))
}

/// Write a string as a little-endian `u16` byte length followed by its UTF-8
/// bytes.
///
/// # Errors
///
/// Returns [`io::ErrorKind::InvalidInput`] (and writes nothing) if the string
/// is longer than `u16::MAX` bytes, since the length prefix could not
/// represent it. Any error of the underlying writer is passed through.
fn write_str(w: &mut impl Write, s: &str) -> io::Result<()> {
    let len = u16::try_from(s.len()).map_err(|_| {
        io::Error::new(
            io::ErrorKind::InvalidInput,
            format!(
                "string of {} bytes exceeds the maximum of {} bytes",
                s.len(),
                u16::MAX
            ),
        )
    })?;
    write_u16(w, len)?;
    w.write_all(s.as_bytes())
}

/// Read a string written by `write_str`.
///
/// # Errors
///
/// Returns [`io::ErrorKind::InvalidData`] if the payload is not valid UTF-8;
/// the data comes from a file and must not be able to crash the program. Any
/// error of the underlying reader (including a premature end of input) is
/// passed through.
fn read_str(r: &mut impl Read) -> io::Result<String> {
    let len = usize::from(read_u16(r)?);
    let mut buf = vec![0_u8; len];
    r.read_exact(&mut buf)?;
    String::from_utf8(buf).map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))
}
|
||||||
|
|
||||||
|
/// Write one page record: id, length, redirect flag (one byte, `0` or `1`),
/// then the title.
///
/// `read_page` reads the fields back in this same order.
fn write_page(w: &mut impl Write, page: &Page) -> io::Result<()> {
    write_u32(w, page.id)?;
    write_u32(w, page.length)?;
    write_u8(w, u8::from(page.redirect))?;
    write_str(w, &page.title)
}
|
||||||
|
|
||||||
|
pub fn read_page(r: &mut impl Read) -> io::Result<Page> {
|
||||||
|
Ok(Page {
|
||||||
|
id: read_u32(r)?,
|
||||||
|
length: read_u32(r)?,
|
||||||
|
redirect: read_u8(r)? != 0,
|
||||||
|
title: read_str(r)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write one link record: start, len, then the flags byte.
///
/// `read_link` reads the fields back in this same order.
fn write_link(w: &mut impl Write, link: &Link) -> io::Result<()> {
    write_u32(w, link.start)?;
    write_u32(w, link.len)?;
    write_u8(w, link.flags)
}
|
||||||
|
|
||||||
|
fn read_link(r: &mut impl Read) -> io::Result<Link> {
|
||||||
|
Ok(Link {
|
||||||
|
start: read_u32(r)?,
|
||||||
|
len: read_u32(r)?,
|
||||||
|
flags: read_u8(r)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write(w: &mut impl Write, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> {
|
||||||
|
assert!(pages.len() < u32::MAX as usize);
|
||||||
|
assert!(links.len() < u32::MAX as usize);
|
||||||
|
assert_eq!(pages.len(), graph.nodes.len());
|
||||||
|
assert_eq!(links.len(), graph.edges.len());
|
||||||
|
write_u32(w, pages.len() as u32)?;
|
||||||
|
write_u32(w, links.len() as u32)?;
|
||||||
|
|
||||||
|
for page in pages {
|
||||||
|
write_page(w, page)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for link in links {
|
||||||
|
write_link(w, link)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for node in &graph.nodes {
|
||||||
|
write_u32(w, node.0)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for edge in &graph.edges {
|
||||||
|
write_u32(w, edge.0)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read(r: &mut impl Read) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
|
||||||
|
let n_pages = read_u32(r)?;
|
||||||
|
let n_links = read_u32(r)?;
|
||||||
|
|
||||||
|
let mut pages = Vec::with_capacity(n_pages as usize);
|
||||||
|
let mut links = Vec::with_capacity(n_links as usize);
|
||||||
|
let mut graph = Graph::with_capacity(n_pages as usize, n_links as usize);
|
||||||
|
|
||||||
|
for _ in 0..n_pages {
|
||||||
|
pages.push(read_page(r)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
for _ in 0..n_links {
|
||||||
|
links.push(read_link(r)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
for _ in 0..n_pages {
|
||||||
|
graph.nodes.push(EdgeIdx(read_u32(r)?));
|
||||||
|
}
|
||||||
|
|
||||||
|
for _ in 0..n_links {
|
||||||
|
graph.edges.push(NodeIdx(read_u32(r)?));
|
||||||
|
}
|
||||||
|
|
||||||
|
assert_eq!(pages.len(), graph.nodes.len());
|
||||||
|
assert_eq!(links.len(), graph.edges.len());
|
||||||
|
graph.check_consistency();
|
||||||
|
Ok((pages, links, graph))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write_to_file(path: &Path, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> {
|
||||||
|
let mut file = BufWriter::new(File::create(path)?);
|
||||||
|
write(&mut file, pages, links, graph)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn read_from_file(path: &Path) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
|
||||||
|
let mut file = BufReader::new(File::open(path)?);
|
||||||
|
read(&mut file)
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -196,6 +196,10 @@ impl Graph {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Append a new node whose entry in `nodes` is the current length of
/// `edges`.
///
/// Edges pushed onto `edges` after this call therefore start at the index
/// recorded for the new node.
pub fn add_node(&mut self) {
    self.nodes.push(EdgeIdx::new(self.edges.len()));
}
|
||||||
|
|
||||||
pub fn check_consistency(&self) {
|
pub fn check_consistency(&self) {
|
||||||
if self.nodes.is_empty() {
|
if self.nodes.is_empty() {
|
||||||
assert!(self.edges.is_empty(), "edges must belong to existing nodes");
|
assert!(self.edges.is_empty(), "edges must belong to existing nodes");
|
||||||
|
|
|
||||||
|
|
@ -1,62 +1,16 @@
|
||||||
mod algo;
|
mod algo;
|
||||||
pub mod commands;
|
mod commands;
|
||||||
mod data;
|
mod data;
|
||||||
mod graph;
|
mod graph;
|
||||||
mod util;
|
mod util;
|
||||||
|
|
||||||
use std::fs::File;
|
use std::{io, path::PathBuf};
|
||||||
use std::io::{self, BufReader};
|
|
||||||
use std::path::{Path, PathBuf};
|
|
||||||
use std::time::Instant;
|
|
||||||
|
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use data::store;
|
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, Parser)]
|
|
||||||
pub enum PhilosophyGameCmd {
|
|
||||||
First,
|
|
||||||
Canonical,
|
|
||||||
Cluster,
|
|
||||||
Trace { start: String },
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Parser)]
|
#[derive(Debug, Parser)]
|
||||||
enum Command {
|
enum Command {
|
||||||
/// Read sift data on stdin and output brood data.
|
Ingest(commands::ingest::Cmd),
|
||||||
Ingest,
|
|
||||||
/// Read and reexport brood data.
|
|
||||||
Reexport {
|
|
||||||
to: PathBuf,
|
|
||||||
#[arg(long, short = 'P')]
|
|
||||||
in_parens: Option<bool>,
|
|
||||||
#[arg(long, short = 'S')]
|
|
||||||
in_structure: Option<bool>,
|
|
||||||
},
|
|
||||||
/// Find a path from one article to another.
|
|
||||||
Path {
|
|
||||||
from: String,
|
|
||||||
to: String,
|
|
||||||
/// Flip start and end article.
|
|
||||||
#[arg(short, long)]
|
|
||||||
flip: bool,
|
|
||||||
},
|
|
||||||
/// Find the longest shortest path starting at an article.
|
|
||||||
LongestShortestPath {
|
|
||||||
from: String,
|
|
||||||
},
|
|
||||||
/// Analyze articles using "Philosophy Game" rules.
|
|
||||||
PhilosophyGame {
|
|
||||||
#[command(subcommand)]
|
|
||||||
subcmd: PhilosophyGameCmd,
|
|
||||||
},
|
|
||||||
/// Print all page titles.
|
|
||||||
ListPages,
|
|
||||||
/// Print all links.
|
|
||||||
ListLinks {
|
|
||||||
/// The page to inspect.
|
|
||||||
page: String,
|
|
||||||
},
|
|
||||||
Test,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Parser)]
|
#[derive(Debug, Parser)]
|
||||||
|
|
@ -69,42 +23,6 @@ struct Args {
|
||||||
fn main() -> io::Result<()> {
|
fn main() -> io::Result<()> {
|
||||||
let args = Args::parse();
|
let args = Args::parse();
|
||||||
match args.command {
|
match args.command {
|
||||||
Command::Ingest => commands::ingest::ingest(&args.datafile),
|
Command::Ingest(cmd) => cmd.run(&args.datafile),
|
||||||
Command::Reexport {
|
|
||||||
to,
|
|
||||||
in_parens,
|
|
||||||
in_structure,
|
|
||||||
} => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure),
|
|
||||||
Command::Path { from, to, flip } => {
|
|
||||||
if flip {
|
|
||||||
commands::path::path(&args.datafile, &to, &from)
|
|
||||||
} else {
|
|
||||||
commands::path::path(&args.datafile, &from, &to)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Command::LongestShortestPath { from } => {
|
|
||||||
commands::longest_shortest_path::run(&args.datafile, &from)
|
|
||||||
}
|
|
||||||
Command::PhilosophyGame { subcmd } => {
|
|
||||||
commands::philosophy_game::run(&args.datafile, subcmd)
|
|
||||||
}
|
|
||||||
Command::ListPages => commands::list_pages::run(&args.datafile),
|
|
||||||
Command::ListLinks { page } => commands::list_links::run(&args.datafile, &page),
|
|
||||||
Command::Test => test(&args.datafile),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test(datafile: &Path) -> io::Result<()> {
|
|
||||||
let a = Instant::now();
|
|
||||||
// println!(">> Import adjacency list");
|
|
||||||
// let mut databuf = BufReader::new(File::open(datafile)?);
|
|
||||||
// let adjlist = store::read_adjacency_list(&mut databuf)?;
|
|
||||||
println!(">> Import graph");
|
|
||||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
|
||||||
let (pages, links, graph) = store::read_graph(&mut databuf)?;
|
|
||||||
let b = Instant::now();
|
|
||||||
|
|
||||||
println!("{:?}", b.duration_since(a));
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -1,20 +1,151 @@
|
||||||
use crate::data::{
|
use std::{fmt, iter, time::Instant};
|
||||||
adjacency_list::{AdjacencyList, Page},
|
|
||||||
info::{LinkInfo, PageInfo},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn normalize_link(link: &str) -> String {
|
use regex::Regex;
|
||||||
let link = link.trim().replace(' ', "_");
|
use thousands::Separable;
|
||||||
|
|
||||||
// Make only first char lowercase
|
pub struct Counter {
|
||||||
link.chars()
|
n: usize,
|
||||||
.next()
|
last_print: Instant,
|
||||||
.iter()
|
|
||||||
.flat_map(|c| c.to_lowercase())
|
|
||||||
.chain(link.chars().skip(1))
|
|
||||||
.collect::<String>()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Counter {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
n: 0,
|
||||||
|
last_print: Instant::now(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn tick(&mut self) {
|
||||||
|
self.n += 1;
|
||||||
|
if self.n % 10_000 != 0 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let now = Instant::now();
|
||||||
|
if now.duration_since(self.last_print).as_secs() < 4 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("{:>12}", self.n.separate_with_underscores());
|
||||||
|
self.last_print = now;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn done(&self) {
|
||||||
|
println!("{:>12} (done)", self.n.separate_with_underscores());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/mediawiki.Title.phpCharToUpper.js
/// Displays the wrapped character upper-cased according to the table linked
/// above instead of plain Unicode upper-casing.
///
/// All special cases are written as `\u{...}` escapes: several of them are
/// ligatures or precomposed characters that text normalization silently
/// rewrites into multiple or different characters, which would corrupt (or
/// break compilation of) literal `char` patterns.
struct PhpCharToUpper(char);

impl fmt::Display for PhpCharToUpper {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.0 {
            // Greek small letters with ypogegrammeni whose counterpart (with
            // prosgegrammeni) is the code point exactly eight positions
            // later: U+1F80..U+1F87 -> U+1F88..U+1F8F, and likewise for the
            // U+1F90 and U+1FA0 rows.
            c @ ('\u{1F80}'..='\u{1F87}'
            | '\u{1F90}'..='\u{1F97}'
            | '\u{1FA0}'..='\u{1FA7}') => {
                let upper = char::from_u32(u32::from(c) + 8)
                    .expect("U+1F88..=U+1FAF are valid scalar values");
                write!(f, "{upper}")
            }
            '\u{1FB3}' => write!(f, "\u{1FBC}"),
            '\u{1FC3}' => write!(f, "\u{1FCC}"),
            '\u{1FF3}' => write!(f, "\u{1FFC}"),

            // Do not capitalize
            c @ ('\u{00DF}'
            | '\u{0149}'
            | '\u{01F0}'
            | '\u{0282}'
            | '\u{0345}'
            | '\u{0390}'
            | '\u{03B0}'
            | '\u{0587}'
            // Georgian
            | '\u{10D0}'..='\u{10FA}'
            | '\u{10FD}'..='\u{10FF}'
            | '\u{1D8E}'
            | '\u{1E96}'..='\u{1E9A}'
            // Greek Extended
            | '\u{1F50}'
            | '\u{1F52}'
            | '\u{1F54}'
            | '\u{1F56}'
            | '\u{1F88}'..='\u{1F8F}'
            | '\u{1F98}'..='\u{1F9F}'
            | '\u{1FA8}'..='\u{1FAF}'
            | '\u{1FB2}'
            | '\u{1FB4}'
            | '\u{1FB6}'
            | '\u{1FB7}'
            | '\u{1FBC}'
            | '\u{1FC2}'
            | '\u{1FC4}'
            | '\u{1FC6}'
            | '\u{1FC7}'
            | '\u{1FCC}'
            | '\u{1FD2}'
            | '\u{1FD3}'
            | '\u{1FD6}'
            | '\u{1FD7}'
            | '\u{1FE2}'
            | '\u{1FE3}'
            | '\u{1FE4}'
            | '\u{1FE6}'
            | '\u{1FE7}'
            | '\u{1FF2}'
            | '\u{1FF4}'
            | '\u{1FF6}'
            | '\u{1FF7}'
            | '\u{1FFC}'
            // Small roman numerals
            | '\u{2170}'..='\u{217F}'
            // Circled latin small letters
            | '\u{24D0}'..='\u{24E9}'
            | '\u{A794}'
            | '\u{A7B9}'
            | '\u{A7BB}'
            | '\u{A7BD}'
            | '\u{A7BF}'
            | '\u{A7C3}'
            // Latin and Armenian ligatures
            | '\u{FB00}'..='\u{FB06}'
            | '\u{FB13}'..='\u{FB17}'
            // Medefaidrin small letters
            | '\u{16E60}'..='\u{16E7F}') => {
                write!(f, "{c}")
            }

            // Capitalize normally
            c => write!(f, "{}", c.to_uppercase()),
        }
    }
}
|
||||||
|
|
||||||
|
pub struct TitleNormalizer {
|
||||||
|
strip_bidi: Regex,
|
||||||
|
clean_up_whitespace: Regex,
|
||||||
|
trim_underscore_start: Regex,
|
||||||
|
trim_underscore_end: Regex,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TitleNormalizer {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
strip_bidi: Regex::new("[\u{200E}\u{200F}\u{202A}-\u{202E}]").unwrap(),
|
||||||
|
|
||||||
|
clean_up_whitespace: Regex::new(concat!(
|
||||||
|
"[ _\u{00A0}\u{1680}\u{180E}\u{2000}-\u{200A}",
|
||||||
|
"\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+"
|
||||||
|
))
|
||||||
|
.unwrap(),
|
||||||
|
|
||||||
|
trim_underscore_start: Regex::new("^_+").unwrap(),
|
||||||
|
|
||||||
|
trim_underscore_end: Regex::new("_+$").unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalize an article title.
|
||||||
|
///
|
||||||
|
/// See also <https://github.com/wikimedia/mediawiki-title>.
|
||||||
|
pub fn normalize(&self, title: &str) -> String {
|
||||||
|
// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L403
|
||||||
|
|
||||||
|
// Strip Unicode bidi override characters
|
||||||
|
let title = self.strip_bidi.replace_all(title, "");
|
||||||
|
|
||||||
|
// Clean up whitespace
|
||||||
|
let title = self.clean_up_whitespace.replace_all(&title, "_");
|
||||||
|
|
||||||
|
// Trim _ from beginning and end
|
||||||
|
let title = self.trim_underscore_start.replace_all(&title, "");
|
||||||
|
let title = self.trim_underscore_end.replace_all(&title, "");
|
||||||
|
|
||||||
|
// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L206
|
||||||
|
let Some(first) = title.chars().next() else {
|
||||||
|
return String::new();
|
||||||
|
};
|
||||||
|
let rest = &title[first.len_utf8()..];
|
||||||
|
format!("{}{rest}", PhpCharToUpper(first))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
pub fn find_index_of_title(pages: &[Page<PageInfo>], title: &str) -> u32 {
|
pub fn find_index_of_title(pages: &[Page<PageInfo>], title: &str) -> u32 {
|
||||||
let title = normalize_link(title);
|
let title = normalize_link(title);
|
||||||
pages
|
pages
|
||||||
|
|
@ -37,3 +168,4 @@ pub fn resolve_redirects(data: &AdjacencyList<PageInfo, LinkInfo>, mut page_idx:
|
||||||
return page_idx;
|
return page_idx;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue