diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 813574a..16c8cc8 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "anstream" version = "0.6.18" @@ -56,9 +65,11 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", + "regex", "rustc-hash", "serde", "serde_json", + "thousands", ] [[package]] @@ -149,6 +160,35 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + [[package]] name = "rustc-hash" version = "2.1.0" @@ -210,6 +250,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "thousands" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" + [[package]] name = "unicode-ident" version = "1.0.14" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index f53334d..a560f73 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -5,6 +5,8 @@ edition = "2021" [dependencies] clap = { version = "4.5.23", features = ["derive", "deprecated"] } +regex = "1.11.1" rustc-hash = "2.1.0" serde = { version = "1.0.217", features = ["derive"] } serde_json = "1.0.134" +thousands = "0.2.0" diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 6da3050..b3ac910 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,7 +1 @@ pub mod ingest; -pub mod list_links; -pub mod list_pages; -pub mod longest_shortest_path; -pub mod path; -pub mod philosophy_game; -pub mod reexport; diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index cda10d0..5407a8b 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -1,16 +1,18 @@ -use std::collections::hash_map::Entry; -use std::fs::File; -use std::io::{self, BufRead, BufReader, BufWriter}; -use std::path::Path; -use std::u32; +use std::{ + collections::{hash_map::Entry, HashMap}, + fs::File, + io::{self, BufRead, BufReader, Seek}, + path::{Path, PathBuf}, +}; -use rustc_hash::FxHashMap; use serde::Deserialize; +use thousands::Separable; -use crate::data::adjacency_list::{AdjacencyList, Page}; -use crate::data::info::{LinkInfo, PageInfo}; -use crate::data::store; -use crate::util; +use crate::{ + data::{self, Link, Page}, + graph::{Graph, NodeIdx}, + util::{Counter, TitleNormalizer}, +}; #[derive(Deserialize)] struct JsonPage { @@ -21,151 +23,139 @@ struct JsonPage { redirect: Option, } -/* -Importing is a tad complicated because of multiple criteria: +fn read_titles(r: &mut BufReader) -> io::Result> { + let mut counter = Counter::new(); + let mut titles = vec![]; -1. The data must be read in a single pass on stdin -2. The process should not consume a lot of memory - (can't store the decoded json data directly) -3. The process should result in a nice and compact adjacency list format - -Because of this, the import is a bit more complex and has two passes. - -The first pass imports the data into an adjacency-list-like format, but the -`Link::to` field points to a title in `Titles` instead of a page. - -The second pass then resolves the links to page indices and throws away all -links that don't point to any known page. -*/ - -#[derive(Default)] -struct Titles { - /// Normalized titles - titles: Vec, - /// Map from normalized title to index in [`Self::titles`]. - map: FxHashMap, -} - -impl Titles { - fn insert(&mut self, title: String) -> u32 { - match self.map.entry(title.clone()) { - Entry::Occupied(occupied) => *occupied.get(), - Entry::Vacant(vacant) => { - let idx = self.titles.len() as u32; - self.titles.push(title); - vacant.insert(idx); - idx - } - } + for line in r.lines() { + counter.tick(); + let page = serde_json::from_str::(&line?).unwrap(); + titles.push(page.title); } - fn get(&self, i: u32) -> &str { - &self.titles[i as usize] - } + counter.done(); + Ok(titles) } -fn first_stage() -> io::Result<(AdjacencyList, Titles)> { - let mut titles = Titles::default(); - let mut result = AdjacencyList::default(); +fn compute_title_lookup(normalizer: &TitleNormalizer, titles: &[String]) -> HashMap { + let mut counter = Counter::new(); + let mut title_lookup = HashMap::new(); - let stdin = BufReader::new(io::stdin()); - for (i, line) in stdin.lines().enumerate() { - let json_page = serde_json::from_str::(&line?).unwrap(); - - result.push_page(PageInfo { - id: json_page.id, - length: json_page.length, - redirect: json_page.redirect.is_some(), - title: json_page.title, - }); - - if let Some(to) = json_page.redirect { - let to = titles.insert(util::normalize_link(&to)); - result.push_link(to, LinkInfo::default()); - } else { - for (to, start, len, flags) in json_page.links { - let to = titles.insert(util::normalize_link(&to)); - result.push_link(to, LinkInfo { start, len, flags }); - } - } - - if (i + 1) % 100_000 == 0 { - eprintln!("{} pages imported", i + 1) - } - } - - eprintln!("Pages: {}", result.pages.len()); - eprintln!("Links: {}", result.links.len()); - eprintln!("Titles: {}", titles.titles.len()); - eprintln!("Title map entries: {}", titles.map.len()); - - Ok((result, titles)) -} - -/// Create map from normalized title to index in pages. -fn initialize_pages_map(pages: &[Page]) -> FxHashMap { - let mut result = FxHashMap::default(); - for (i, p) in pages.iter().enumerate() { - match result.entry(util::normalize_link(&p.data.title)) { - Entry::Occupied(entry) => { - eprintln!( - "{:?} already exists at index {} as {:?}", - p.data.title, - entry.get(), - util::normalize_link(&p.data.title) - ); + for (i, title) in titles.iter().enumerate() { + counter.tick(); + match title_lookup.entry(normalizer.normalize(title)) { + Entry::Occupied(mut entry) => { + let prev_i = *entry.get(); + let prev = &titles[prev_i as usize]; + if prev == title { + println!(" {title:?} ({prev_i}) occurs again at {i}"); + // Prefer later occurrences of articles over earlier ones under + // the assumption that their contents are "fresher". + entry.insert(i as u32); + } else { + println!( + " {prev:?} ({prev_i}) and {title:?} ({i}) both normalize to {:?}", + normalizer.normalize(title) + ); + } } Entry::Vacant(entry) => { entry.insert(i as u32); } } } - result + + counter.done(); + title_lookup } -fn second_stage( - first_stage: &AdjacencyList, - titles: &Titles, -) -> AdjacencyList { - let pages_map = initialize_pages_map(&first_stage.pages); - let mut result = AdjacencyList::default(); +fn read_page_data( + normalizer: &TitleNormalizer, + title_lookup: &HashMap, + r: &mut BufReader, +) -> io::Result<(Vec, Vec, Graph)> { + let mut counter = Counter::new(); + let mut pages = vec![]; + let mut links = vec![]; + let mut graph = Graph::new(); - for (page_idx, page) in first_stage.pages() { - result.push_page(page.data.clone()); + for (i, line) in r.lines().enumerate() { + counter.tick(); + let page = serde_json::from_str::(&line?).unwrap(); + let normalized = normalizer.normalize(&page.title); - for (_, link) in first_stage.links(page_idx) { - let title = util::normalize_link(titles.get(link.to)); - if let Some(to) = pages_map.get(&title) { - // The link points to an existing article, we should keep it - result.push_link(*to, link.data); - } + let expected_i = title_lookup[&normalized]; + if i as u32 != expected_i { + // Articles may occur multiple times, and this is not the instance + // of the article we should keep. + println!(" Skipping {:?} ({i}) in favor of {expected_i}", page.title); + continue; } - if (page_idx + 1) % 100_000 == 0 { - eprintln!("{} pages imported", page_idx + 1) + graph.add_node(); + pages.push(Page { + id: page.id, + title: page.title, + length: page.length, + redirect: page.redirect.is_some(), + }); + + let mut page_links = page.links; + if let Some(target) = page.redirect { + page_links.clear(); + let len = target.len() as u32; + page_links.push((target, 0, len, 0)); + } + + for (target, start, len, flags) in page_links { + if let Some(target_i) = title_lookup.get(&normalizer.normalize(&target)) { + graph.edges.push(NodeIdx(*target_i)); + links.push(Link { start, len, flags }); + } } } - eprintln!("Pages: {}", result.pages.len()); - eprintln!("Links: {}", result.links.len()); - eprintln!("Page map entries: {}", pages_map.len()); - - result + counter.done(); + Ok((pages, links, graph)) } -pub fn ingest(datafile: &Path) -> io::Result<()> { - eprintln!(">> First stage"); - let (first_stage, titles) = first_stage()?; - - eprintln!(">> Second stage"); - let data = second_stage(&first_stage, &titles); - - eprintln!(">> Consistency check"); - data.check_consistency(); - - eprintln!(">> Export"); - let mut datafile = BufWriter::new(File::create(datafile)?); - store::write_adjacency_list(&data, &mut datafile)?; - - Ok(()) +/// Convert sift data to brood data. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + /// The sift data file to ingest. + data: PathBuf, +} + +impl Cmd { + pub fn run(self, data: &Path) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + println!(">> First pass"); + let mut sift_data = BufReader::new(File::open(&self.data)?); + + println!("> Reading titles"); + let titles = read_titles(&mut sift_data)?; + + println!("> Computing title index lookup table"); + let title_lookup = compute_title_lookup(&normalizer, &titles); + drop(titles); // Don't hoard memory + + println!(">> Second pass"); + sift_data.seek(io::SeekFrom::Start(0))?; + + println!("> Reading page data"); + let (pages, links, graph) = read_page_data(&normalizer, &title_lookup, &mut sift_data)?; + drop(title_lookup); // Don't hoard memory + drop(sift_data); // No longer needed + + println!("> Checking consistency"); + graph.check_consistency(); + + println!(">> Export"); + println!("Pages: {}", pages.len().separate_with_underscores()); + println!("Links: {}", links.len().separate_with_underscores()); + data::write_to_file(data, &pages, &links, &graph)?; + + Ok(()) + } } diff --git a/brood/src/data.rs b/brood/src/data.rs index 16aa0eb..69fc362 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -1,3 +1,177 @@ -pub mod adjacency_list; -pub mod info; -pub mod store; +use std::{ + fs::File, + io::{self, BufReader, BufWriter, Read, Write}, + path::Path, +}; + +use crate::graph::{EdgeIdx, Graph, NodeIdx}; + +#[derive(Debug, Clone)] +pub struct Page { + pub id: u32, + pub title: String, + pub length: u32, + pub redirect: bool, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct Link { + pub start: u32, + pub len: u32, + pub flags: u8, +} + +impl Link { + pub fn in_parens(self) -> bool { + self.flags & 0b1 != 0 + } + + pub fn in_structure(self) -> bool { + self.flags & 0b10 != 0 + } +} + +struct Store<'a, W>(&'a mut W); + +fn write_u8(w: &mut impl Write, n: u8) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u8(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 1]; + r.read_exact(&mut buf)?; + Ok(u8::from_le_bytes(buf)) +} + +fn write_u16(w: &mut impl Write, n: u16) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u16(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 2]; + r.read_exact(&mut buf)?; + Ok(u16::from_le_bytes(buf)) +} + +fn write_u32(w: &mut impl Write, n: u32) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u32(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 4]; + r.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) +} + +fn write_str(w: &mut impl Write, s: &str) -> io::Result<()> { + assert!(s.len() <= u16::MAX as usize); + write_u16(w, s.len() as u16)?; + w.write_all(s.as_bytes())?; + Ok(()) +} + +fn read_str(r: &mut impl Read) -> io::Result { + let len = read_u16(r)? as usize; + let mut buf = vec![0_u8; len]; + r.read_exact(&mut buf)?; + Ok(String::from_utf8(buf).unwrap()) +} + +fn write_page(w: &mut impl Write, page: &Page) -> io::Result<()> { + write_u32(w, page.id)?; + write_u32(w, page.length)?; + write_u8(w, if page.redirect { 1 } else { 0 })?; + write_str(w, &page.title)?; + Ok(()) +} + +pub fn read_page(r: &mut impl Read) -> io::Result { + Ok(Page { + id: read_u32(r)?, + length: read_u32(r)?, + redirect: read_u8(r)? != 0, + title: read_str(r)?, + }) +} + +fn write_link(w: &mut impl Write, link: &Link) -> io::Result<()> { + write_u32(w, link.start)?; + write_u32(w, link.len)?; + write_u8(w, link.flags)?; + Ok(()) +} + +fn read_link(r: &mut impl Read) -> io::Result { + Ok(Link { + start: read_u32(r)?, + len: read_u32(r)?, + flags: read_u8(r)?, + }) +} + +fn write(w: &mut impl Write, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> { + assert!(pages.len() < u32::MAX as usize); + assert!(links.len() < u32::MAX as usize); + assert_eq!(pages.len(), graph.nodes.len()); + assert_eq!(links.len(), graph.edges.len()); + write_u32(w, pages.len() as u32)?; + write_u32(w, links.len() as u32)?; + + for page in pages { + write_page(w, page)?; + } + + for link in links { + write_link(w, link)?; + } + + for node in &graph.nodes { + write_u32(w, node.0)?; + } + + for edge in &graph.edges { + write_u32(w, edge.0)?; + } + + Ok(()) +} + +fn read(r: &mut impl Read) -> io::Result<(Vec, Vec, Graph)> { + let n_pages = read_u32(r)?; + let n_links = read_u32(r)?; + + let mut pages = Vec::with_capacity(n_pages as usize); + let mut links = Vec::with_capacity(n_links as usize); + let mut graph = Graph::with_capacity(n_pages as usize, n_links as usize); + + for _ in 0..n_pages { + pages.push(read_page(r)?); + } + + for _ in 0..n_links { + links.push(read_link(r)?); + } + + for _ in 0..n_pages { + graph.nodes.push(EdgeIdx(read_u32(r)?)); + } + + for _ in 0..n_links { + graph.edges.push(NodeIdx(read_u32(r)?)); + } + + assert_eq!(pages.len(), graph.nodes.len()); + assert_eq!(links.len(), graph.edges.len()); + graph.check_consistency(); + Ok((pages, links, graph)) +} + +pub fn write_to_file(path: &Path, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> { + let mut file = BufWriter::new(File::create(path)?); + write(&mut file, pages, links, graph) +} + +pub fn read_from_file(path: &Path) -> io::Result<(Vec, Vec, Graph)> { + let mut file = BufReader::new(File::open(path)?); + read(&mut file) +} diff --git a/brood/src/graph.rs b/brood/src/graph.rs index 9cd39d4..1cc25da 100644 --- a/brood/src/graph.rs +++ b/brood/src/graph.rs @@ -196,6 +196,10 @@ impl Graph { } } + pub fn add_node(&mut self) { + self.nodes.push(EdgeIdx::new(self.edges.len())); + } + pub fn check_consistency(&self) { if self.nodes.is_empty() { assert!(self.edges.is_empty(), "edges must belong to existing nodes"); diff --git a/brood/src/main.rs b/brood/src/main.rs index 3b93e2e..45bff55 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -1,62 +1,16 @@ mod algo; -pub mod commands; +mod commands; mod data; mod graph; mod util; -use std::fs::File; -use std::io::{self, BufReader}; -use std::path::{Path, PathBuf}; -use std::time::Instant; +use std::{io, path::PathBuf}; use clap::Parser; -use data::store; - -#[derive(Debug, PartialEq, Eq, Parser)] -pub enum PhilosophyGameCmd { - First, - Canonical, - Cluster, - Trace { start: String }, -} #[derive(Debug, Parser)] enum Command { - /// Read sift data on stdin and output brood data. - Ingest, - /// Read and reexport brood data. - Reexport { - to: PathBuf, - #[arg(long, short = 'P')] - in_parens: Option, - #[arg(long, short = 'S')] - in_structure: Option, - }, - /// Find a path from one article to another. - Path { - from: String, - to: String, - /// Flip start and end article. - #[arg(short, long)] - flip: bool, - }, - /// Find the longest shortest path starting at an article. - LongestShortestPath { - from: String, - }, - /// Analyze articles using "Philosophy Game" rules. - PhilosophyGame { - #[command(subcommand)] - subcmd: PhilosophyGameCmd, - }, - /// Print all page titles. - ListPages, - /// Print all links. - ListLinks { - /// The page to inspect. - page: String, - }, - Test, + Ingest(commands::ingest::Cmd), } #[derive(Debug, Parser)] @@ -69,42 +23,6 @@ struct Args { fn main() -> io::Result<()> { let args = Args::parse(); match args.command { - Command::Ingest => commands::ingest::ingest(&args.datafile), - Command::Reexport { - to, - in_parens, - in_structure, - } => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure), - Command::Path { from, to, flip } => { - if flip { - commands::path::path(&args.datafile, &to, &from) - } else { - commands::path::path(&args.datafile, &from, &to) - } - } - Command::LongestShortestPath { from } => { - commands::longest_shortest_path::run(&args.datafile, &from) - } - Command::PhilosophyGame { subcmd } => { - commands::philosophy_game::run(&args.datafile, subcmd) - } - Command::ListPages => commands::list_pages::run(&args.datafile), - Command::ListLinks { page } => commands::list_links::run(&args.datafile, &page), - Command::Test => test(&args.datafile), + Command::Ingest(cmd) => cmd.run(&args.datafile), } } - -fn test(datafile: &Path) -> io::Result<()> { - let a = Instant::now(); - // println!(">> Import adjacency list"); - // let mut databuf = BufReader::new(File::open(datafile)?); - // let adjlist = store::read_adjacency_list(&mut databuf)?; - println!(">> Import graph"); - let mut databuf = BufReader::new(File::open(datafile)?); - let (pages, links, graph) = store::read_graph(&mut databuf)?; - let b = Instant::now(); - - println!("{:?}", b.duration_since(a)); - - Ok(()) -} diff --git a/brood/src/util.rs b/brood/src/util.rs index e1a64ff..1cc1ab8 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,20 +1,151 @@ -use crate::data::{ - adjacency_list::{AdjacencyList, Page}, - info::{LinkInfo, PageInfo}, -}; +use std::{fmt, iter, time::Instant}; -pub fn normalize_link(link: &str) -> String { - let link = link.trim().replace(' ', "_"); +use regex::Regex; +use thousands::Separable; - // Make only first char lowercase - link.chars() - .next() - .iter() - .flat_map(|c| c.to_lowercase()) - .chain(link.chars().skip(1)) - .collect::() +pub struct Counter { + n: usize, + last_print: Instant, } +impl Counter { + pub fn new() -> Self { + Self { + n: 0, + last_print: Instant::now(), + } + } + + pub fn tick(&mut self) { + self.n += 1; + if self.n % 10_000 != 0 { + return; + } + + let now = Instant::now(); + if now.duration_since(self.last_print).as_secs() < 4 { + return; + } + + println!("{:>12}", self.n.separate_with_underscores()); + self.last_print = now; + } + + pub fn done(&self) { + println!("{:>12} (done)", self.n.separate_with_underscores()); + } +} + +// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/mediawiki.Title.phpCharToUpper.js +struct PhpCharToUpper(char); + +impl fmt::Display for PhpCharToUpper { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.0 { + // Do something special, I guess + 'ᾀ' => write!(f, "ᾈ"), + 'ᾁ' => write!(f, "ᾉ"), + 'ᾂ' => write!(f, "ᾊ"), + 'ᾃ' => write!(f, "ᾋ"), + 'ᾄ' => write!(f, "ᾌ"), + 'ᾅ' => write!(f, "ᾍ"), + 'ᾆ' => write!(f, "ᾎ"), + 'ᾇ' => write!(f, "ᾏ"), + 'ᾐ' => write!(f, "ᾘ"), + 'ᾑ' => write!(f, "ᾙ"), + 'ᾒ' => write!(f, "ᾚ"), + 'ᾓ' => write!(f, "ᾛ"), + 'ᾔ' => write!(f, "ᾜ"), + 'ᾕ' => write!(f, "ᾝ"), + 'ᾖ' => write!(f, "ᾞ"), + 'ᾗ' => write!(f, "ᾟ"), + 'ᾠ' => write!(f, "ᾨ"), + 'ᾡ' => write!(f, "ᾩ"), + 'ᾢ' => write!(f, "ᾪ"), + 'ᾣ' => write!(f, "ᾫ"), + 'ᾤ' => write!(f, "ᾬ"), + 'ᾥ' => write!(f, "ᾭ"), + 'ᾦ' => write!(f, "ᾮ"), + 'ᾧ' => write!(f, "ᾯ"), + 'ᾳ' => write!(f, "ᾼ"), + 'ῃ' => write!(f, "ῌ"), + 'ῳ' => write!(f, "ῼ"), + + // Do not capitalize + 'ß' | 'ʼn' | 'ǰ' | 'ʂ' | 'ͅ' | 'ΐ' | 'ΰ' | 'և' | 'ა' | 'ბ' | 'გ' | 'დ' | 'ე' | 'ვ' + | 'ზ' | 'თ' | 'ი' | 'კ' | 'ლ' | 'მ' | 'ნ' | 'ო' | 'პ' | 'ჟ' | 'რ' | 'ს' | 'ტ' | 'უ' + | 'ფ' | 'ქ' | 'ღ' | 'ყ' | 'შ' | 'ჩ' | 'ც' | 'ძ' | 'წ' | 'ჭ' | 'ხ' | 'ჯ' | 'ჰ' | 'ჱ' + | 'ჲ' | 'ჳ' | 'ჴ' | 'ჵ' | 'ჶ' | 'ჷ' | 'ჸ' | 'ჹ' | 'ჺ' | 'ჽ' | 'ჾ' | 'ჿ' | 'ᶎ' | 'ẖ' + | 'ẗ' | 'ẘ' | 'ẙ' | 'ẚ' | 'ὐ' | 'ὒ' | 'ὔ' | 'ὖ' | 'ᾈ' | 'ᾉ' | 'ᾊ' | 'ᾋ' | 'ᾌ' | 'ᾍ' + | 'ᾎ' | 'ᾏ' | 'ᾘ' | 'ᾙ' | 'ᾚ' | 'ᾛ' | 'ᾜ' | 'ᾝ' | 'ᾞ' | 'ᾟ' | 'ᾨ' | 'ᾩ' | 'ᾪ' | 'ᾫ' + | 'ᾬ' | 'ᾭ' | 'ᾮ' | 'ᾯ' | 'ᾲ' | 'ᾴ' | 'ᾶ' | 'ᾷ' | 'ᾼ' | 'ῂ' | 'ῄ' | 'ῆ' | 'ῇ' | 'ῌ' + | 'ῒ' | 'ΐ' | 'ῖ' | 'ῗ' | 'ῢ' | 'ΰ' | 'ῤ' | 'ῦ' | 'ῧ' | 'ῲ' | 'ῴ' | 'ῶ' | 'ῷ' | 'ῼ' + | 'ⅰ' | 'ⅱ' | 'ⅲ' | 'ⅳ' | 'ⅴ' | 'ⅵ' | 'ⅶ' | 'ⅷ' | 'ⅸ' | 'ⅹ' | 'ⅺ' | 'ⅻ' | 'ⅼ' | 'ⅽ' + | 'ⅾ' | 'ⅿ' | 'ⓐ' | 'ⓑ' | 'ⓒ' | 'ⓓ' | 'ⓔ' | 'ⓕ' | 'ⓖ' | 'ⓗ' | 'ⓘ' | 'ⓙ' | 'ⓚ' | 'ⓛ' + | 'ⓜ' | 'ⓝ' | 'ⓞ' | 'ⓟ' | 'ⓠ' | 'ⓡ' | 'ⓢ' | 'ⓣ' | 'ⓤ' | 'ⓥ' | 'ⓦ' | 'ⓧ' | 'ⓨ' | 'ⓩ' + | 'ꞔ' | 'ꞹ' | 'ꞻ' | 'ꞽ' | 'ꞿ' | 'ꟃ' | 'ff' | 'fi' | 'fl' | 'ffi' | 'ffl' | 'ſt' | 'st' | 'ﬓ' + | 'ﬔ' | 'ﬕ' | 'ﬖ' | 'ﬗ' | '𖹠' | '𖹡' | '𖹢' | '𖹣' | '𖹤' | '𖹥' | '𖹦' | '𖹧' | '𖹨' | '𖹩' + | '𖹪' | '𖹫' | '𖹬' | '𖹭' | '𖹮' | '𖹯' | '𖹰' | '𖹱' | '𖹲' | '𖹳' | '𖹴' | '𖹵' | '𖹶' | '𖹷' + | '𖹸' | '𖹹' | '𖹺' | '𖹻' | '𖹼' | '𖹽' | '𖹾' | '𖹿' => { + write!(f, "{}", self.0) + } + + // Capitalize normally + c => write!(f, "{}", c.to_uppercase()), + } + } +} + +pub struct TitleNormalizer { + strip_bidi: Regex, + clean_up_whitespace: Regex, + trim_underscore_start: Regex, + trim_underscore_end: Regex, +} + +impl TitleNormalizer { + pub fn new() -> Self { + Self { + strip_bidi: Regex::new("[\u{200E}\u{200F}\u{202A}-\u{202E}]").unwrap(), + + clean_up_whitespace: Regex::new(concat!( + "[ _\u{00A0}\u{1680}\u{180E}\u{2000}-\u{200A}", + "\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+" + )) + .unwrap(), + + trim_underscore_start: Regex::new("^_+").unwrap(), + + trim_underscore_end: Regex::new("_+$").unwrap(), + } + } + + /// Normalize an article title. + /// + /// See also . + pub fn normalize(&self, title: &str) -> String { + // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L403 + + // Strip Unicode bidi override characters + let title = self.strip_bidi.replace_all(title, ""); + + // Clean up whitespace + let title = self.clean_up_whitespace.replace_all(&title, "_"); + + // Trim _ from beginning and end + let title = self.trim_underscore_start.replace_all(&title, ""); + let title = self.trim_underscore_end.replace_all(&title, ""); + + // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L206 + let Some(first) = title.chars().next() else { + return String::new(); + }; + let rest = &title[first.len_utf8()..]; + format!("{}{rest}", PhpCharToUpper(first)) + } +} + +/* pub fn find_index_of_title(pages: &[Page], title: &str) -> u32 { let title = normalize_link(title); pages @@ -37,3 +168,4 @@ pub fn resolve_redirects(data: &AdjacencyList, mut page_idx: return page_idx; } } +*/