From 7af2a4e06f00c6b881a90db7a6f0a60ab2838550 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 29 Dec 2024 20:48:52 +0100 Subject: [PATCH 01/36] Print nicer sift stats --- sift/sift.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sift/sift.py b/sift/sift.py index bde2e74..2562fa2 100644 --- a/sift/sift.py +++ b/sift/sift.py @@ -172,16 +172,21 @@ def process_xmldump_page(page): # Page info as simple tuples def simple_pages(input): dump = mwxml.Dump.from_file(sys.stdin) + articles = 0 for i, page in enumerate(dump.pages): + if (i + 1) % 1000 == 0: + # Yeah, the articles are usually off by one + eprint(f"{i+1:8} pages, {articles:8} articles, at pid {page.id:8}") + if page.namespace != 0: continue - if (i + 1) % 1000 == 0: - eprint(f"{i+1:8} pages, at pid {page.id:8}") - + articles += 1 [revision] = list(page) # Every page has exactly one revision yield page.id, page.title, revision.text or "", page.redirect + eprint(f"{articles} articles total") + def process_simple_page(info): pid, title, text, redirect = info From c2c1b1234ce9532e23021caa48706b21830aac85 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 29 Dec 2024 23:22:02 +0100 Subject: [PATCH 02/36] Change link data with page info --- brood/src/data/adjacency_list.rs | 52 +++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/brood/src/data/adjacency_list.rs b/brood/src/data/adjacency_list.rs index 04a1124..2de1631 100644 --- a/brood/src/data/adjacency_list.rs +++ b/brood/src/data/adjacency_list.rs @@ -10,7 +10,7 @@ pub struct Page

{ } impl

Page

{ - pub fn change_data(self, f: impl Fn(P) -> P2) -> Page { + pub fn change_data(self, f: &impl Fn(P) -> P2) -> Page { Page { start: self.start, data: f(self.data), @@ -26,12 +26,19 @@ pub struct Link { } impl Link { - pub fn change_data(self, f: impl Fn(L) -> L2) -> Link { + pub fn change_data(self, f: &impl Fn(L) -> L2) -> Link { Link { to: self.to, data: f(self.data), } } + + pub fn change_data_with_page(self, page: &P, f: &impl Fn(&P, L) -> L2) -> Link { + Link { + to: self.to, + data: f(page, self.data), + } + } } pub struct AdjacencyList { @@ -42,8 +49,8 @@ pub struct AdjacencyList { impl Default for AdjacencyList { fn default() -> Self { Self { - pages: Default::default(), - links: Default::default(), + pages: vec![], + links: vec![], } } } @@ -102,11 +109,11 @@ impl AdjacencyList { self.link_range(page_idx).map(|i| (i, self.link(i))) } - pub fn change_page_data(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList { + pub fn change_page_data(self, page_f: impl Fn(P) -> P2) -> AdjacencyList { let pages = self .pages .into_iter() - .map(|p| p.change_data(page_f)) + .map(|p| p.change_data(&page_f)) .collect::>(); AdjacencyList { @@ -115,11 +122,11 @@ impl AdjacencyList { } } - pub fn change_link_data(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList { + pub fn change_link_data(self, link_f: impl Fn(L) -> L2) -> AdjacencyList { let links = self .links .into_iter() - .map(|l| l.change_data(link_f)) + .map(|l| l.change_data(&link_f)) .collect::>(); AdjacencyList { @@ -127,6 +134,35 @@ impl AdjacencyList { links, } } + + pub fn change_link_data_with_page( + self, + link_f: impl Fn(&P, L) -> L2, + ) -> AdjacencyList { + let mut pages = self.pages.iter().peekable(); + let Some(mut cur_page) = pages.next() else { + // The list is empty, nothing to do + return AdjacencyList::default(); + }; + + let mut links = vec![]; + + for (i, link) in self.links.into_iter().enumerate() { + if let Some(page) = pages.peek() { + if i >= page.start as usize { + cur_page = 
page; + pages.next(); + } + } + + links.push(link.change_data_with_page(&cur_page.data, &link_f)); + } + + AdjacencyList { + pages: self.pages, + links, + } + } } impl AdjacencyList { From f5f4f99a2fa99216e9e813fcccaaadfb86636737 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 29 Dec 2024 23:28:55 +0100 Subject: [PATCH 03/36] Update dependencies --- brood/Cargo.lock | 129 +++++++++++++++++++++++++---------------------- brood/Cargo.toml | 8 +-- 2 files changed, 72 insertions(+), 65 deletions(-) diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 0162043..813574a 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -1,12 +1,12 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "anstream" -version = "0.6.14" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", @@ -19,33 +19,33 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.7" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.0" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +checksum = 
"79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.3" +version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", "windows-sys", @@ -63,9 +63,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.7" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", "clap_derive", @@ -73,9 +73,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.7" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstream", "anstyle", @@ -85,9 +85,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.5" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ "heck", "proc-macro2", @@ -97,15 +97,15 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.1" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "colorchoice" -version = "1.0.1" +version = "1.0.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "heck" @@ -115,39 +115,45 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "is_terminal_polyfill" -version = "1.70.0" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" 
+checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" [[package]] name = "ryu" @@ -157,18 +163,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "serde" -version = "1.0.203" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.203" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", @@ -177,11 +183,12 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.118" +version = "1.0.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4" +checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -194,9 +201,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.68" +version = "2.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" +checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058" dependencies = [ "proc-macro2", "quote", @@ -205,9 +212,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = 
"adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "utf8parse" @@ -217,18 +224,18 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "windows-sys" -version = "0.52.0" +version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -242,48 +249,48 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index 940f920..f53334d 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -4,7 +4,7 @@ version = "0.0.0" edition = "2021" [dependencies] -clap = { version = "4.5.7", features = ["derive", "deprecated"] } -rustc-hash = "2.0.0" -serde = { version = "1.0.203", features = ["derive"] } -serde_json = "1.0.118" +clap = { version = "4.5.23", features = ["derive", "deprecated"] } +rustc-hash = "2.1.0" +serde = { version = "1.0.217", features = ["derive"] } +serde_json = "1.0.134" From 49665f74ce762ea475824f0423f9716e109160e4 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 
30 Dec 2024 13:12:14 +0100 Subject: [PATCH 04/36] List links to and from an article --- brood/src/commands.rs | 1 + brood/src/commands/list_links.rs | 87 ++++++++++++++++++++++++++++++++ brood/src/main.rs | 6 +++ 3 files changed, 94 insertions(+) create mode 100644 brood/src/commands/list_links.rs diff --git a/brood/src/commands.rs b/brood/src/commands.rs index ffff9d3..6da3050 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,4 +1,5 @@ pub mod ingest; +pub mod list_links; pub mod list_pages; pub mod longest_shortest_path; pub mod path; diff --git a/brood/src/commands/list_links.rs b/brood/src/commands/list_links.rs new file mode 100644 index 0000000..37c9972 --- /dev/null +++ b/brood/src/commands/list_links.rs @@ -0,0 +1,87 @@ +use std::{ + collections::HashSet, + fs::File, + io::{self, BufReader}, + path::Path, +}; + +use crate::{ + data::{ + adjacency_list::AdjacencyList, + info::{LinkInfo, PageInfo}, + store, + }, + util, +}; + +fn links_from(data: &AdjacencyList, idx: u32) -> HashSet { + data.links(idx).map(|(_, ld)| ld.to).collect() +} + +fn links_to(data: &AdjacencyList, idx: u32) -> HashSet { + let mut links = HashSet::::new(); + for (pi, _) in data.pages() { + for (_, ld) in data.links(pi) { + if ld.to == idx { + links.insert(pi); + continue; + } + } + } + links +} + +fn print_links(data: &AdjacencyList, name: &str, links: &HashSet) { + let mut links = links + .iter() + .map(|pi| { + let page = data.page(*pi); + (&page.data.title as &str, page.data.redirect) + }) + .collect::>(); + + links.sort(); + + println!(">> {name} ({}):", links.len()); + for (title, redirect) in links { + if redirect { + println!("v {title}"); + } else { + println!("- {title}"); + } + } +} + +pub fn run(datafile: &Path, page: &str) -> io::Result<()> { + println!(">> Import"); + let mut databuf = BufReader::new(File::open(datafile)?); + let data = store::read_adjacency_list(&mut databuf)?; + + println!(">> Locate page"); + let idx = util::resolve_redirects(&data, 
util::find_index_of_title(&data.pages, page)); + println!("Page: {:?}", data.page(idx).data.title); + + println!(">> Find links"); + let from = links_from(&data, idx); + let to = links_to(&data, idx); + let twins = from.intersection(&to).copied().collect::>(); + let twinless_from = from.difference(&twins).copied().collect::>(); + let twinless_to = to.difference(&twins).copied().collect::>(); + + println!(); + print_links(&data, "From", &from); + + println!(); + print_links(&data, "To", &to); + + println!(); + print_links(&data, "Twins", &twins); + + println!(); + print_links(&data, "From without twins", &twinless_from); + + println!(); + print_links(&data, "To without twins", &twinless_to); + + Ok(()) +} diff --git a/brood/src/main.rs b/brood/src/main.rs index e4b4074..57d1b81 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -44,6 +44,11 @@ enum Command { }, /// Print all page titles. ListPages, + /// Print all links. + ListLinks { + /// The page to inspect. + page: String, + }, } #[derive(Debug, Parser)] @@ -76,5 +81,6 @@ fn main() -> io::Result<()> { commands::philosophy_game::run(&args.datafile, subcmd) } Command::ListPages => commands::list_pages::run(&args.datafile), + Command::ListLinks { page } => commands::list_links::run(&args.datafile, &page), } } From e3e191b748ad039a4939036cd9e2e92b49f99863 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 30 Dec 2024 13:12:25 +0100 Subject: [PATCH 05/36] Improve philosophy game trace output --- brood/src/commands/philosophy_game.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/brood/src/commands/philosophy_game.rs b/brood/src/commands/philosophy_game.rs index 178df1d..b276bd5 100644 --- a/brood/src/commands/philosophy_game.rs +++ b/brood/src/commands/philosophy_game.rs @@ -161,12 +161,14 @@ fn print_trace(data: &AdjacencyList, forward: &PageMap, star let next = forward.get(current); if next == u32::MAX { - println!("dead-end reached"); + println!("> dead-end reached"); return; } if 
visited.contains(&next) { - println!("loop detected"); + let page = data.page(next); + let title = &page.data.title; + println!("> loop detected ({title})"); return; } From 34df6c9f14665f814c2030bb29cb59df178bf1e6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 30 Dec 2024 13:12:49 +0100 Subject: [PATCH 06/36] Try out petgraph --- brood/Cargo.lock | 39 +++++++++++++++ brood/Cargo.toml | 1 + brood/src/commands.rs | 1 + brood/src/commands/path_petgraph.rs | 77 +++++++++++++++++++++++++++++ brood/src/data/store.rs | 33 +++++++++++++ brood/src/main.rs | 15 ++++++ 6 files changed, 166 insertions(+) create mode 100644 brood/src/commands/path_petgraph.rs diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 813574a..ff57248 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -56,6 +56,7 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", + "petgraph", "rustc-hash", "serde", "serde_json", @@ -107,12 +108,40 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "indexmap" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -131,6 +160,16 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "proc-macro2" version = "1.0.92" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index f53334d..97393e9 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [dependencies] clap = { version = "4.5.23", features = ["derive", "deprecated"] } +petgraph = "0.6.5" rustc-hash = "2.1.0" serde = { version = "1.0.217", features = ["derive"] } serde_json = "1.0.134" diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 6da3050..a5b0156 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -3,5 +3,6 @@ pub mod list_links; pub mod list_pages; pub mod longest_shortest_path; pub mod path; +pub mod path_petgraph; pub mod philosophy_game; pub mod reexport; diff --git a/brood/src/commands/path_petgraph.rs b/brood/src/commands/path_petgraph.rs new file mode 100644 index 0000000..02262e8 --- /dev/null +++ b/brood/src/commands/path_petgraph.rs @@ -0,0 +1,77 @@ +use std::{ + fs::File, + io::{self, BufReader}, + path::Path, +}; + +use petgraph::{ + algo, + graph::NodeIndex, + visit::{EdgeRef, IntoNodeReferences}, + Graph, +}; + +use crate::{ + data::{ + info::{LinkInfo, PageInfo}, + store, + }, + util::{self, normalize_link}, +}; + +pub fn find_index_of_title(graph: &Graph, title: &str) -> NodeIndex { + let title = util::normalize_link(title); + graph + .node_references() + .find(|(_, nw)| 
normalize_link(&nw.title) == title) + .map(|(ni, _)| ni) + .expect("invalid title") +} + +pub fn resolve_redirects(graph: &Graph, mut page: NodeIndex) -> NodeIndex { + loop { + if graph.node_weight(page).unwrap().redirect { + if let Some(link) = graph.edges(page).next() { + page = link.target(); + continue; + } + } + return page; + } +} + +pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { + println!(">> Import"); + let mut databuf = BufReader::new(File::open(datafile)?); + let graph = store::read_petgraph(&mut databuf)?; + + println!(">> Locate from and to"); + let start = resolve_redirects(&graph, find_index_of_title(&graph, from)); + let goal = resolve_redirects(&graph, find_index_of_title(&graph, to)); + println!("From: {:?}", graph.node_weight(start).unwrap().title); + println!("To: {:?}", graph.node_weight(goal).unwrap().title); + + println!(">> Find path"); + let Some((cost, path)) = algo::astar( + &graph, + start, + |n| n == goal, + |e| !graph.node_weight(e.source()).unwrap().redirect as u32, + |_| 0, + ) else { + println!("No path found"); + return Ok(()); + }; + + println!("Path found (cost {cost}, length {}):", path.len()); + for page in path { + let page = graph.node_weight(page).unwrap(); + if page.redirect { + println!(" v {:?}", page.title); + } else { + println!(" - {:?}", page.title); + } + } + + Ok(()) +} diff --git a/brood/src/data/store.rs b/brood/src/data/store.rs index afba1a3..118e8a1 100644 --- a/brood/src/data/store.rs +++ b/brood/src/data/store.rs @@ -1,5 +1,7 @@ use std::io::{self, Read, Write}; +use petgraph::{graph::NodeIndex, Directed, Graph}; + use super::{ adjacency_list::{AdjacencyList, Link, Page}, info::{LinkInfo, PageInfo}, @@ -132,3 +134,34 @@ pub fn read_adjacency_list(from: &mut R) -> io::Result(from: &mut R) -> io::Result> { + let n_pages = read_u32(from)?; + let n_links = read_u32(from)?; + + let mut graph = Graph::<_, _, Directed, _>::with_capacity(n_pages as usize, n_links as usize); + let mut 
page_starts = Vec::with_capacity(n_pages as usize); + + for _ in 0..n_pages { + let page = read_page(from)?; + page_starts.push(page.start); + graph.add_node(page.data); + } + + let mut ni = 0; + for ei in 0..n_links { + while ei >= page_starts.get(ni).copied().unwrap_or(u32::MAX) { + ni += 1; + } + ni -= 1; + + let link = read_link(from)?; + graph.add_edge( + NodeIndex::new(ni), + NodeIndex::new(link.to as usize), + link.data, + ); + } + + Ok(graph) +} diff --git a/brood/src/main.rs b/brood/src/main.rs index 57d1b81..774f6b0 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -35,6 +35,14 @@ enum Command { #[arg(short, long)] flip: bool, }, + /// Find a path from one article to another. + PathPetgraph { + from: String, + to: String, + /// Flip start and end article. + #[arg(short, long)] + flip: bool, + }, /// Find the longest shortest path starting at an article. LongestShortestPath { from: String }, /// Analyze articles using "Philosophy Game" rules. @@ -74,6 +82,13 @@ fn main() -> io::Result<()> { commands::path::path(&args.datafile, &from, &to) } } + Command::PathPetgraph { from, to, flip } => { + if flip { + commands::path_petgraph::path(&args.datafile, &to, &from) + } else { + commands::path_petgraph::path(&args.datafile, &from, &to) + } + } Command::LongestShortestPath { from } => { commands::longest_shortest_path::run(&args.datafile, &from) } From 778cb6748d5f4fe3799d41b48a07fb0de2defae5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 30 Dec 2024 16:00:09 +0100 Subject: [PATCH 07/36] Remove petgraph --- brood/Cargo.lock | 39 --------------- brood/Cargo.toml | 1 - brood/src/commands.rs | 1 - brood/src/commands/path_petgraph.rs | 77 ----------------------------- brood/src/data/store.rs | 33 ------------- brood/src/main.rs | 15 ------ 6 files changed, 166 deletions(-) delete mode 100644 brood/src/commands/path_petgraph.rs diff --git a/brood/Cargo.lock b/brood/Cargo.lock index ff57248..813574a 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ 
-56,7 +56,6 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", - "petgraph", "rustc-hash", "serde", "serde_json", @@ -108,40 +107,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" -[[package]] -name = "equivalent" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" - -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - -[[package]] -name = "hashbrown" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" - [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "indexmap" -version = "2.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" -dependencies = [ - "equivalent", - "hashbrown", -] - [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -160,16 +131,6 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "petgraph" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" -dependencies = [ - "fixedbitset", - "indexmap", -] - [[package]] name = "proc-macro2" version = "1.0.92" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index 97393e9..f53334d 100644 --- a/brood/Cargo.toml +++ 
b/brood/Cargo.toml @@ -5,7 +5,6 @@ edition = "2021" [dependencies] clap = { version = "4.5.23", features = ["derive", "deprecated"] } -petgraph = "0.6.5" rustc-hash = "2.1.0" serde = { version = "1.0.217", features = ["derive"] } serde_json = "1.0.134" diff --git a/brood/src/commands.rs b/brood/src/commands.rs index a5b0156..6da3050 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -3,6 +3,5 @@ pub mod list_links; pub mod list_pages; pub mod longest_shortest_path; pub mod path; -pub mod path_petgraph; pub mod philosophy_game; pub mod reexport; diff --git a/brood/src/commands/path_petgraph.rs b/brood/src/commands/path_petgraph.rs deleted file mode 100644 index 02262e8..0000000 --- a/brood/src/commands/path_petgraph.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::{ - fs::File, - io::{self, BufReader}, - path::Path, -}; - -use petgraph::{ - algo, - graph::NodeIndex, - visit::{EdgeRef, IntoNodeReferences}, - Graph, -}; - -use crate::{ - data::{ - info::{LinkInfo, PageInfo}, - store, - }, - util::{self, normalize_link}, -}; - -pub fn find_index_of_title(graph: &Graph, title: &str) -> NodeIndex { - let title = util::normalize_link(title); - graph - .node_references() - .find(|(_, nw)| normalize_link(&nw.title) == title) - .map(|(ni, _)| ni) - .expect("invalid title") -} - -pub fn resolve_redirects(graph: &Graph, mut page: NodeIndex) -> NodeIndex { - loop { - if graph.node_weight(page).unwrap().redirect { - if let Some(link) = graph.edges(page).next() { - page = link.target(); - continue; - } - } - return page; - } -} - -pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { - println!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let graph = store::read_petgraph(&mut databuf)?; - - println!(">> Locate from and to"); - let start = resolve_redirects(&graph, find_index_of_title(&graph, from)); - let goal = resolve_redirects(&graph, find_index_of_title(&graph, to)); - println!("From: {:?}", 
graph.node_weight(start).unwrap().title); - println!("To: {:?}", graph.node_weight(goal).unwrap().title); - - println!(">> Find path"); - let Some((cost, path)) = algo::astar( - &graph, - start, - |n| n == goal, - |e| !graph.node_weight(e.source()).unwrap().redirect as u32, - |_| 0, - ) else { - println!("No path found"); - return Ok(()); - }; - - println!("Path found (cost {cost}, length {}):", path.len()); - for page in path { - let page = graph.node_weight(page).unwrap(); - if page.redirect { - println!(" v {:?}", page.title); - } else { - println!(" - {:?}", page.title); - } - } - - Ok(()) -} diff --git a/brood/src/data/store.rs b/brood/src/data/store.rs index 118e8a1..afba1a3 100644 --- a/brood/src/data/store.rs +++ b/brood/src/data/store.rs @@ -1,7 +1,5 @@ use std::io::{self, Read, Write}; -use petgraph::{graph::NodeIndex, Directed, Graph}; - use super::{ adjacency_list::{AdjacencyList, Link, Page}, info::{LinkInfo, PageInfo}, @@ -134,34 +132,3 @@ pub fn read_adjacency_list(from: &mut R) -> io::Result(from: &mut R) -> io::Result> { - let n_pages = read_u32(from)?; - let n_links = read_u32(from)?; - - let mut graph = Graph::<_, _, Directed, _>::with_capacity(n_pages as usize, n_links as usize); - let mut page_starts = Vec::with_capacity(n_pages as usize); - - for _ in 0..n_pages { - let page = read_page(from)?; - page_starts.push(page.start); - graph.add_node(page.data); - } - - let mut ni = 0; - for ei in 0..n_links { - while ei >= page_starts.get(ni).copied().unwrap_or(u32::MAX) { - ni += 1; - } - ni -= 1; - - let link = read_link(from)?; - graph.add_edge( - NodeIndex::new(ni), - NodeIndex::new(link.to as usize), - link.data, - ); - } - - Ok(graph) -} diff --git a/brood/src/main.rs b/brood/src/main.rs index 774f6b0..57d1b81 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -35,14 +35,6 @@ enum Command { #[arg(short, long)] flip: bool, }, - /// Find a path from one article to another. 
- PathPetgraph { - from: String, - to: String, - /// Flip start and end article. - #[arg(short, long)] - flip: bool, - }, /// Find the longest shortest path starting at an article. LongestShortestPath { from: String }, /// Analyze articles using "Philosophy Game" rules. @@ -82,13 +74,6 @@ fn main() -> io::Result<()> { commands::path::path(&args.datafile, &from, &to) } } - Command::PathPetgraph { from, to, flip } => { - if flip { - commands::path_petgraph::path(&args.datafile, &to, &from) - } else { - commands::path_petgraph::path(&args.datafile, &from, &to) - } - } Command::LongestShortestPath { from } => { commands::longest_shortest_path::run(&args.datafile, &from) } From 18e54c4ce1d83214ab3b3ee952e4efac0df97edb Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 30 Dec 2024 18:19:20 +0100 Subject: [PATCH 08/36] Implement new graph data structure and dijkstra --- brood/src/algo.rs | 77 ++++++++++ brood/src/commands.rs | 1 + brood/src/commands/path2.rs | 77 ++++++++++ brood/src/data/store.rs | 26 ++++ brood/src/graph.rs | 293 ++++++++++++++++++++++++++++++++++++ brood/src/main.rs | 45 +++++- 6 files changed, 516 insertions(+), 3 deletions(-) create mode 100644 brood/src/algo.rs create mode 100644 brood/src/commands/path2.rs create mode 100644 brood/src/graph.rs diff --git a/brood/src/algo.rs b/brood/src/algo.rs new file mode 100644 index 0000000..b6bf26a --- /dev/null +++ b/brood/src/algo.rs @@ -0,0 +1,77 @@ +use std::{cmp::Reverse, collections::BinaryHeap}; + +use crate::graph::{EdgeIdx, Graph, NodeIdx}; + +pub struct Dijkstra<'a> { + graph: &'a Graph, + cost: Vec, + pred: Vec, +} + +impl<'a> Dijkstra<'a> { + pub fn new(graph: &'a Graph) -> Self { + Self { + graph, + cost: vec![u32::MAX; graph.nodes.len()], + pred: vec![NodeIdx::NONE; graph.nodes.len()], + } + } + + pub fn run( + &mut self, + start: NodeIdx, + goal: impl Fn(NodeIdx) -> bool, + cost: impl Fn(NodeIdx, EdgeIdx, NodeIdx) -> u32, + ) { + self.cost[start.usize()] = 0; + let mut queue = BinaryHeap::new(); + 
queue.push((Reverse(0), start)); + + while let Some((Reverse(curr_cost), curr)) = queue.pop() { + if goal(curr) { + break; // We've found the shortest path to our target + } + + // These seem to never actually occur + // if curr_cost > self.cost[curr.usize()] { + // continue; // Outdated entry + // } + + for edge in self.graph.edge_range(curr).map(EdgeIdx::new) { + let next = self.graph.edges[edge.usize()]; + let next_cost = curr_cost + cost(curr, edge, next); + if next_cost < self.cost[next.usize()] { + self.cost[next.usize()] = next_cost; + self.pred[next.usize()] = curr; + queue.push((Reverse(next_cost), next)); + } + } + } + } + + #[inline] + pub fn cost(&self, node: NodeIdx) -> u32 { + self.cost[node.usize()] + } + + #[inline] + pub fn pred(&self, node: NodeIdx) -> NodeIdx { + self.pred[node.usize()] + } + + pub fn path(&self, goal: NodeIdx) -> Vec { + let mut path = vec![]; + let mut at = goal; + + loop { + path.push(at); + at = self.pred(at); + if at == NodeIdx::NONE { + break; + } + } + + path.reverse(); + path + } +} diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 6da3050..d4b8155 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -3,5 +3,6 @@ pub mod list_links; pub mod list_pages; pub mod longest_shortest_path; pub mod path; +pub mod path2; pub mod philosophy_game; pub mod reexport; diff --git a/brood/src/commands/path2.rs b/brood/src/commands/path2.rs new file mode 100644 index 0000000..55c72ed --- /dev/null +++ b/brood/src/commands/path2.rs @@ -0,0 +1,77 @@ +use std::{ + fs::File, + io::{self, BufReader}, + path::Path, +}; + +use crate::{ + algo::Dijkstra, + data::{info::PageInfo, store}, + graph::{Graph, NodeIdx}, + util, +}; + +pub fn find_index_of_title(pages: &[PageInfo], title: &str) -> NodeIdx { + let title = util::normalize_link(title); + pages + .iter() + .enumerate() + .find(|(_, p)| util::normalize_link(&p.title) == title) + .map(|(i, _)| NodeIdx::new(i)) + .expect("invalid title") +} + +pub fn 
resolve_redirects(pages: &[PageInfo], graph: &Graph, mut page: NodeIdx) -> NodeIdx { + loop { + if pages[page.usize()].redirect { + if let Some(next) = graph.edges_for(page).first() { + page = *next; + continue; + } + } + + return page; + } +} + +pub fn path(datafile: &Path, start: &str, goal: &str) -> io::Result<()> { + println!(">> Import"); + let mut databuf = BufReader::new(File::open(datafile)?); + let (pages, _links, graph) = store::read_graph(&mut databuf)?; + + println!(">> Locate from and to"); + let start = resolve_redirects(&pages, &graph, find_index_of_title(&pages, start)); + let goal = resolve_redirects(&pages, &graph, find_index_of_title(&pages, goal)); + println!("Start: {:?}", pages[start.usize()].title); + println!("Goal: {:?}", pages[goal.usize()].title); + + println!(">> Find path"); + println!("> Preparing dijkstra"); + let mut dijkstra = Dijkstra::new(&graph); + println!("> Running dijkstra"); + dijkstra.run( + start, + |node| node == goal, + |source, _edge, _target| !pages[source.usize()].redirect as u32, + ); + + if dijkstra.cost(goal) == u32::MAX { + println!("No path found"); + return Ok(()); + } + + println!("> Collecting path"); + let path = dijkstra.path(goal); + let cost = dijkstra.cost(goal); + println!("Path found (cost {cost}, length {}):", path.len()); + for page in path { + let info = &pages[page.usize()]; + if info.redirect { + println!(" v {:?}", info.title); + } else { + println!(" - {:?}", info.title); + } + } + + Ok(()) +} diff --git a/brood/src/data/store.rs b/brood/src/data/store.rs index afba1a3..06a35eb 100644 --- a/brood/src/data/store.rs +++ b/brood/src/data/store.rs @@ -1,5 +1,7 @@ use std::io::{self, Read, Write}; +use crate::graph::{EdgeIdx, Graph, NodeIdx}; + use super::{ adjacency_list::{AdjacencyList, Link, Page}, info::{LinkInfo, PageInfo}, @@ -132,3 +134,27 @@ pub fn read_adjacency_list(from: &mut R) -> io::Result io::Result<(Vec, Vec, Graph)> { + let n_pages = read_u32(from)?; + let n_links = read_u32(from)?; + 
+ let mut pages = Vec::with_capacity(n_pages as usize); + let mut links = Vec::with_capacity(n_links as usize); + let mut graph = Graph::with_capacity(n_pages as usize, n_links as usize); + + for _ in 0..n_pages { + let page = read_page(from)?; + graph.nodes.push(EdgeIdx(page.start)); + pages.push(page.data); + } + + for _ in 0..n_links { + let link = read_link(from)?; + graph.edges.push(NodeIdx(link.to)); + links.push(link.data); + } + + graph.check_consistency(); + Ok((pages, links, graph)) +} diff --git a/brood/src/graph.rs b/brood/src/graph.rs new file mode 100644 index 0000000..9cd39d4 --- /dev/null +++ b/brood/src/graph.rs @@ -0,0 +1,293 @@ +use std::ops::{Add, AddAssign, Range, Sub, SubAssign}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct NodeIdx(pub u32); + +impl NodeIdx { + pub const NONE: Self = Self(u32::MAX); + + #[inline] + pub const fn new(value: usize) -> Self { + Self(value as u32) + } + + #[inline] + pub const fn usize(self) -> usize { + self.0 as usize + } +} + +impl From for NodeIdx { + fn from(value: u32) -> Self { + Self(value) + } +} + +impl From for NodeIdx { + fn from(value: usize) -> Self { + Self::new(value) + } +} + +impl Add for NodeIdx { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +impl AddAssign for NodeIdx { + fn add_assign(&mut self, rhs: Self) { + self.0 += rhs.0; + } +} + +impl Sub for NodeIdx { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0 - rhs.0) + } +} + +impl SubAssign for NodeIdx { + fn sub_assign(&mut self, rhs: Self) { + self.0 -= rhs.0; + } +} + +impl Add for NodeIdx { + type Output = Self; + + fn add(self, rhs: u32) -> Self::Output { + Self(self.0 + rhs) + } +} + +impl AddAssign for NodeIdx { + fn add_assign(&mut self, rhs: u32) { + self.0 += rhs; + } +} + +impl Sub for NodeIdx { + type Output = Self; + + fn sub(self, rhs: u32) -> Self::Output { + Self(self.0 - rhs) + } +} + +impl SubAssign for 
NodeIdx { + fn sub_assign(&mut self, rhs: u32) { + self.0 -= rhs; + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct EdgeIdx(pub u32); + +impl EdgeIdx { + pub const NONE: Self = Self(u32::MAX); + + #[inline] + pub const fn new(value: usize) -> Self { + Self(value as u32) + } + + #[inline] + pub const fn usize(self) -> usize { + self.0 as usize + } +} + +impl From for EdgeIdx { + fn from(value: u32) -> Self { + Self(value) + } +} + +impl From for EdgeIdx { + fn from(value: usize) -> Self { + Self::new(value) + } +} + +impl Add for EdgeIdx { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +impl AddAssign for EdgeIdx { + fn add_assign(&mut self, rhs: Self) { + self.0 += rhs.0; + } +} + +impl Sub for EdgeIdx { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0 - rhs.0) + } +} + +impl SubAssign for EdgeIdx { + fn sub_assign(&mut self, rhs: Self) { + self.0 -= rhs.0; + } +} + +impl Add for EdgeIdx { + type Output = Self; + + fn add(self, rhs: u32) -> Self::Output { + Self(self.0 + rhs) + } +} + +impl AddAssign for EdgeIdx { + fn add_assign(&mut self, rhs: u32) { + self.0 += rhs; + } +} + +impl Sub for EdgeIdx { + type Output = Self; + + fn sub(self, rhs: u32) -> Self::Output { + Self(self.0 - rhs) + } +} + +impl SubAssign for EdgeIdx { + fn sub_assign(&mut self, rhs: u32) { + self.0 -= rhs; + } +} + +#[derive(Default)] +pub struct Graph { + /// A node points to the first of its edges. + /// + /// A special case is that if the subsequent node points to the same edge, + /// the current node has no edges. + pub nodes: Vec, + + /// An edge points to a target node. + /// + /// The source node is defined implicitly by the graph data structure. 
+ pub edges: Vec, +} + +impl Graph { + pub fn new() -> Self { + Self::default() + } + + pub fn with_capacity(nodes: usize, edges: usize) -> Self { + Self { + nodes: Vec::with_capacity(nodes), + edges: Vec::with_capacity(edges), + } + } + + pub fn check_consistency(&self) { + if self.nodes.is_empty() { + assert!(self.edges.is_empty(), "edges must belong to existing nodes"); + return; + } + + assert!(self.nodes.len() < u32::MAX as usize, "too many nodes"); + assert!(self.edges.len() < u32::MAX as usize, "too many edges"); + + assert_eq!( + *self.nodes.first().unwrap(), + EdgeIdx(0), + "first node pointer must be 0" + ); + + for (ni, node) in self.nodes.iter().cloned().enumerate() { + assert!( + node.usize() < self.edges.len(), + "node pointers must in range" + ); + + if let Some(succ) = self.nodes.get(ni + 1) { + assert!(node <= *succ, "node pointers must be well-ordered"); + } + } + + for edge in &self.edges { + assert!( + edge.usize() < self.nodes.len(), + "edge pointers must be in range" + ); + } + } + + pub fn nodes(&self) -> impl Iterator + '_ { + (0..self.nodes.len()).map(NodeIdx::new) + } + + pub fn edges(&self) -> impl Iterator + '_ { + Edges::new(self) + } + + pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx { + self.nodes + .get(node.usize()) + .copied() + .unwrap_or_else(|| self.edges.len().into()) + } + + pub fn edge_range(&self, node: NodeIdx) -> Range { + let start = self.nodes[node.usize()]; + let end = self.edge_start(node + 1); + start.usize()..end.usize() + } + + pub fn edges_for(&self, node: NodeIdx) -> &[NodeIdx] { + &self.edges[self.edge_range(node)] + } +} + +struct Edges<'a> { + graph: &'a Graph, + ni: NodeIdx, + ei: EdgeIdx, +} + +impl<'a> Edges<'a> { + fn new(graph: &'a Graph) -> Self { + Self { + graph, + ni: NodeIdx(0), + ei: EdgeIdx(0), + } + } +} + +impl Iterator for Edges<'_> { + type Item = (NodeIdx, NodeIdx); + + fn next(&mut self) -> Option { + if self.ei.usize() >= self.graph.edges.len() { + return None; + } + let to = 
self.graph.edges[self.ei.usize()]; + + // if would not be sufficient because some nodes may not have any edges. + while self.ei >= self.graph.edge_start(self.ni + 1) { + self.ni += 1; + } + let from = self.ni; + + self.ei += 1; + Some((from, to)) + } +} diff --git a/brood/src/main.rs b/brood/src/main.rs index 57d1b81..501540b 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -1,11 +1,16 @@ +mod algo; pub mod commands; mod data; +mod graph; mod util; -use std::io; -use std::path::PathBuf; +use std::fs::File; +use std::io::{self, BufReader}; +use std::path::{Path, PathBuf}; +use std::time::Instant; use clap::Parser; +use data::store; #[derive(Debug, PartialEq, Eq, Parser)] pub enum PhilosophyGameCmd { @@ -35,8 +40,18 @@ enum Command { #[arg(short, long)] flip: bool, }, + /// Find a path from one article to another. + Path2 { + from: String, + to: String, + /// Flip start and end article. + #[arg(short, long)] + flip: bool, + }, /// Find the longest shortest path starting at an article. - LongestShortestPath { from: String }, + LongestShortestPath { + from: String, + }, /// Analyze articles using "Philosophy Game" rules. PhilosophyGame { #[command(subcommand)] @@ -49,6 +64,7 @@ enum Command { /// The page to inspect. 
page: String, }, + Test, } #[derive(Debug, Parser)] @@ -74,6 +90,13 @@ fn main() -> io::Result<()> { commands::path::path(&args.datafile, &from, &to) } } + Command::Path2 { from, to, flip } => { + if flip { + commands::path2::path(&args.datafile, &to, &from) + } else { + commands::path2::path(&args.datafile, &from, &to) + } + } Command::LongestShortestPath { from } => { commands::longest_shortest_path::run(&args.datafile, &from) } @@ -82,5 +105,21 @@ fn main() -> io::Result<()> { } Command::ListPages => commands::list_pages::run(&args.datafile), Command::ListLinks { page } => commands::list_links::run(&args.datafile, &page), + Command::Test => test(&args.datafile), } } + +fn test(datafile: &Path) -> io::Result<()> { + let a = Instant::now(); + // println!(">> Import adjacency list"); + // let mut databuf = BufReader::new(File::open(datafile)?); + // let adjlist = store::read_adjacency_list(&mut databuf)?; + println!(">> Import graph"); + let mut databuf = BufReader::new(File::open(datafile)?); + let (pages, links, graph) = store::read_graph(&mut databuf)?; + let b = Instant::now(); + + println!("{:?}", b.duration_since(a)); + + Ok(()) +} From f819f5bf698830ac10bfd720f19bc932587a614e Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 30 Dec 2024 19:34:33 +0100 Subject: [PATCH 09/36] Remove adjlist-based path implementation --- brood/src/commands.rs | 1 - brood/src/commands/path.rs | 192 +++++++++++------------------------- brood/src/commands/path2.rs | 77 --------------- brood/src/main.rs | 15 --- 4 files changed, 55 insertions(+), 230 deletions(-) delete mode 100644 brood/src/commands/path2.rs diff --git a/brood/src/commands.rs b/brood/src/commands.rs index d4b8155..6da3050 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -3,6 +3,5 @@ pub mod list_links; pub mod list_pages; pub mod longest_shortest_path; pub mod path; -pub mod path2; pub mod philosophy_game; pub mod reexport; diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index 
82079d2..55c72ed 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -1,158 +1,76 @@ -use std::collections::BinaryHeap; -use std::fs::File; -use std::io::{self, BufReader}; -use std::path::Path; +use std::{ + fs::File, + io::{self, BufReader}, + path::Path, +}; -use crate::data::adjacency_list::AdjacencyList; -use crate::data::info::{LinkInfo, PageInfo}; -use crate::data::store; -use crate::util; +use crate::{ + algo::Dijkstra, + data::{info::PageInfo, store}, + graph::{Graph, NodeIdx}, + util, +}; -struct DijkstraPageInfo { - cost: u32, - prev: u32, - redirect: bool, +pub fn find_index_of_title(pages: &[PageInfo], title: &str) -> NodeIdx { + let title = util::normalize_link(title); + pages + .iter() + .enumerate() + .find(|(_, p)| util::normalize_link(&p.title) == title) + .map(|(i, _)| NodeIdx::new(i)) + .expect("invalid title") } -impl DijkstraPageInfo { - fn from_page_info(info: PageInfo) -> Self { - Self { - cost: u32::MAX, - prev: u32::MAX, - redirect: info.redirect, - } - } -} - -struct DijkstraLinkInfo { - cost: u32, -} - -impl DijkstraLinkInfo { - fn from_link_info(info: LinkInfo) -> Self { - Self { - cost: 1, - // cost: 1000 + info.start, - // cost: 10000 + info.start, - // cost: 1000 + info.start / 10, - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq)] -struct Entry { - cost: u32, - page_idx: u32, -} - -impl Entry { - pub fn new(cost: u32, page_idx: u32) -> Self { - Self { cost, page_idx } - } -} - -// Manual implementation so the queue is a min-heap instead of a max-heap. -impl Ord for Entry { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - other - .cost - .cmp(&self.cost) - .then_with(|| self.page_idx.cmp(&other.page_idx)) - } -} - -impl PartialOrd for Entry { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -/// Closely matches the dijkstra example in [std::collections::binary_heap]. 
-fn dijkstra(data: AdjacencyList, from: u32, to: u32) -> Option> { - println!("> Prepare state"); - let mut data = data - .change_page_data(DijkstraPageInfo::from_page_info) - .change_link_data(DijkstraLinkInfo::from_link_info); - let mut queue = BinaryHeap::new(); - data.page_mut(from).data.cost = 0; - queue.push(Entry::new(0, from)); - - println!("> Run dijkstra"); - while let Some(Entry { cost, page_idx }) = queue.pop() { - if page_idx == to { - // We've found the shortest path to our target - break; - } - - let page = data.page(page_idx); - if cost > page.data.cost { - // This queue entry is outdated - continue; - } - - let redirect = page.data.redirect; - for link_idx in data.link_range(page_idx) { - let link = data.link(link_idx); - - let next = Entry { - cost: cost + if redirect { 0 } else { link.data.cost }, - page_idx: link.to, - }; - - let target_page = data.page_mut(link.to); - if next.cost < target_page.data.cost { - target_page.data.cost = next.cost; - target_page.data.prev = page_idx; - queue.push(next); +pub fn resolve_redirects(pages: &[PageInfo], graph: &Graph, mut page: NodeIdx) -> NodeIdx { + loop { + if pages[page.usize()].redirect { + if let Some(next) = graph.edges_for(page).first() { + page = *next; + continue; } } - } - println!("> Collect results"); - let mut steps = vec![]; - let mut at = to; - loop { - steps.push(at); - at = data.page(at).data.prev; - if at == u32::MAX { - break; - }; - } - steps.reverse(); - if steps.first() == Some(&from) { - Some(steps) - } else { - None + return page; } } -pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { +pub fn path(datafile: &Path, start: &str, goal: &str) -> io::Result<()> { println!(">> Import"); let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - let pages = data.pages.clone(); + let (pages, _links, graph) = store::read_graph(&mut databuf)?; println!(">> Locate from and to"); - let from_idx = 
util::resolve_redirects(&data, util::find_index_of_title(&pages, from)); - let to_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, to)); - println!("From: {:?}", data.page(from_idx).data.title); - println!("To: {:?}", data.page(to_idx).data.title); + let start = resolve_redirects(&pages, &graph, find_index_of_title(&pages, start)); + let goal = resolve_redirects(&pages, &graph, find_index_of_title(&pages, goal)); + println!("Start: {:?}", pages[start.usize()].title); + println!("Goal: {:?}", pages[goal.usize()].title); println!(">> Find path"); - let path = dijkstra(data, from_idx, to_idx); + println!("> Preparing dijkstra"); + let mut dijkstra = Dijkstra::new(&graph); + println!("> Running dijkstra"); + dijkstra.run( + start, + |node| node == goal, + |source, _edge, _target| !pages[source.usize()].redirect as u32, + ); - if let Some(path) = path { - println!("Path found:"); - for page_idx in path { - let page = &pages[page_idx as usize]; - if page.data.redirect { - println!(" v {:?}", page.data.title); - } else { - println!(" - {:?}", page.data.title); - } - } - } else { + if dijkstra.cost(goal) == u32::MAX { println!("No path found"); + return Ok(()); + } + + println!("> Collecting path"); + let path = dijkstra.path(goal); + let cost = dijkstra.cost(goal); + println!("Path found (cost {cost}, length {}):", path.len()); + for page in path { + let info = &pages[page.usize()]; + if info.redirect { + println!(" v {:?}", info.title); + } else { + println!(" - {:?}", info.title); + } } Ok(()) diff --git a/brood/src/commands/path2.rs b/brood/src/commands/path2.rs deleted file mode 100644 index 55c72ed..0000000 --- a/brood/src/commands/path2.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::{ - fs::File, - io::{self, BufReader}, - path::Path, -}; - -use crate::{ - algo::Dijkstra, - data::{info::PageInfo, store}, - graph::{Graph, NodeIdx}, - util, -}; - -pub fn find_index_of_title(pages: &[PageInfo], title: &str) -> NodeIdx { - let title = 
util::normalize_link(title); - pages - .iter() - .enumerate() - .find(|(_, p)| util::normalize_link(&p.title) == title) - .map(|(i, _)| NodeIdx::new(i)) - .expect("invalid title") -} - -pub fn resolve_redirects(pages: &[PageInfo], graph: &Graph, mut page: NodeIdx) -> NodeIdx { - loop { - if pages[page.usize()].redirect { - if let Some(next) = graph.edges_for(page).first() { - page = *next; - continue; - } - } - - return page; - } -} - -pub fn path(datafile: &Path, start: &str, goal: &str) -> io::Result<()> { - println!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let (pages, _links, graph) = store::read_graph(&mut databuf)?; - - println!(">> Locate from and to"); - let start = resolve_redirects(&pages, &graph, find_index_of_title(&pages, start)); - let goal = resolve_redirects(&pages, &graph, find_index_of_title(&pages, goal)); - println!("Start: {:?}", pages[start.usize()].title); - println!("Goal: {:?}", pages[goal.usize()].title); - - println!(">> Find path"); - println!("> Preparing dijkstra"); - let mut dijkstra = Dijkstra::new(&graph); - println!("> Running dijkstra"); - dijkstra.run( - start, - |node| node == goal, - |source, _edge, _target| !pages[source.usize()].redirect as u32, - ); - - if dijkstra.cost(goal) == u32::MAX { - println!("No path found"); - return Ok(()); - } - - println!("> Collecting path"); - let path = dijkstra.path(goal); - let cost = dijkstra.cost(goal); - println!("Path found (cost {cost}, length {}):", path.len()); - for page in path { - let info = &pages[page.usize()]; - if info.redirect { - println!(" v {:?}", info.title); - } else { - println!(" - {:?}", info.title); - } - } - - Ok(()) -} diff --git a/brood/src/main.rs b/brood/src/main.rs index 501540b..3b93e2e 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -40,14 +40,6 @@ enum Command { #[arg(short, long)] flip: bool, }, - /// Find a path from one article to another. - Path2 { - from: String, - to: String, - /// Flip start and end article. 
- #[arg(short, long)] - flip: bool, - }, /// Find the longest shortest path starting at an article. LongestShortestPath { from: String, @@ -90,13 +82,6 @@ fn main() -> io::Result<()> { commands::path::path(&args.datafile, &from, &to) } } - Command::Path2 { from, to, flip } => { - if flip { - commands::path2::path(&args.datafile, &to, &from) - } else { - commands::path2::path(&args.datafile, &from, &to) - } - } Command::LongestShortestPath { from } => { commands::longest_shortest_path::run(&args.datafile, &from) } From 3aa8222b6b5e4002e30d403753b0837917ed7404 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 00:48:24 +0100 Subject: [PATCH 10/36] Rewrite ingest command --- brood/Cargo.lock | 46 +++++++ brood/Cargo.toml | 2 + brood/src/commands.rs | 6 - brood/src/commands/ingest.rs | 258 +++++++++++++++++------------------ brood/src/data.rs | 180 +++++++++++++++++++++++- brood/src/graph.rs | 4 + brood/src/main.rs | 90 +----------- brood/src/util.rs | 158 +++++++++++++++++++-- 8 files changed, 502 insertions(+), 242 deletions(-) diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 813574a..16c8cc8 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "anstream" version = "0.6.18" @@ -56,9 +65,11 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", + "regex", "rustc-hash", "serde", "serde_json", + "thousands", ] [[package]] @@ -149,6 +160,35 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + [[package]] name = "rustc-hash" version = "2.1.0" @@ -210,6 +250,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "thousands" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" + [[package]] name = "unicode-ident" version = "1.0.14" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index f53334d..a560f73 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -5,6 +5,8 @@ edition = "2021" [dependencies] clap = { version = "4.5.23", features = ["derive", "deprecated"] } +regex = "1.11.1" rustc-hash = "2.1.0" serde = { version = "1.0.217", features = ["derive"] } serde_json = "1.0.134" +thousands = 
"0.2.0" diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 6da3050..b3ac910 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,7 +1 @@ pub mod ingest; -pub mod list_links; -pub mod list_pages; -pub mod longest_shortest_path; -pub mod path; -pub mod philosophy_game; -pub mod reexport; diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index cda10d0..5407a8b 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -1,16 +1,18 @@ -use std::collections::hash_map::Entry; -use std::fs::File; -use std::io::{self, BufRead, BufReader, BufWriter}; -use std::path::Path; -use std::u32; +use std::{ + collections::{hash_map::Entry, HashMap}, + fs::File, + io::{self, BufRead, BufReader, Seek}, + path::{Path, PathBuf}, +}; -use rustc_hash::FxHashMap; use serde::Deserialize; +use thousands::Separable; -use crate::data::adjacency_list::{AdjacencyList, Page}; -use crate::data::info::{LinkInfo, PageInfo}; -use crate::data::store; -use crate::util; +use crate::{ + data::{self, Link, Page}, + graph::{Graph, NodeIdx}, + util::{Counter, TitleNormalizer}, +}; #[derive(Deserialize)] struct JsonPage { @@ -21,151 +23,139 @@ struct JsonPage { redirect: Option, } -/* -Importing is a tad complicated because of multiple criteria: +fn read_titles(r: &mut BufReader) -> io::Result> { + let mut counter = Counter::new(); + let mut titles = vec![]; -1. The data must be read in a single pass on stdin -2. The process should not consume a lot of memory - (can't store the decoded json data directly) -3. The process should result in a nice and compact adjacency list format - -Because of this, the import is a bit more complex and has two passes. - -The first pass imports the data into an adjacency-list-like format, but the -`Link::to` field points to a title in `Titles` instead of a page. - -The second pass then resolves the links to page indices and throws away all -links that don't point to any known page. 
-*/ - -#[derive(Default)] -struct Titles { - /// Normalized titles - titles: Vec, - /// Map from normalized title to index in [`Self::titles`]. - map: FxHashMap, -} - -impl Titles { - fn insert(&mut self, title: String) -> u32 { - match self.map.entry(title.clone()) { - Entry::Occupied(occupied) => *occupied.get(), - Entry::Vacant(vacant) => { - let idx = self.titles.len() as u32; - self.titles.push(title); - vacant.insert(idx); - idx - } - } + for line in r.lines() { + counter.tick(); + let page = serde_json::from_str::(&line?).unwrap(); + titles.push(page.title); } - fn get(&self, i: u32) -> &str { - &self.titles[i as usize] - } + counter.done(); + Ok(titles) } -fn first_stage() -> io::Result<(AdjacencyList, Titles)> { - let mut titles = Titles::default(); - let mut result = AdjacencyList::default(); +fn compute_title_lookup(normalizer: &TitleNormalizer, titles: &[String]) -> HashMap { + let mut counter = Counter::new(); + let mut title_lookup = HashMap::new(); - let stdin = BufReader::new(io::stdin()); - for (i, line) in stdin.lines().enumerate() { - let json_page = serde_json::from_str::(&line?).unwrap(); - - result.push_page(PageInfo { - id: json_page.id, - length: json_page.length, - redirect: json_page.redirect.is_some(), - title: json_page.title, - }); - - if let Some(to) = json_page.redirect { - let to = titles.insert(util::normalize_link(&to)); - result.push_link(to, LinkInfo::default()); - } else { - for (to, start, len, flags) in json_page.links { - let to = titles.insert(util::normalize_link(&to)); - result.push_link(to, LinkInfo { start, len, flags }); - } - } - - if (i + 1) % 100_000 == 0 { - eprintln!("{} pages imported", i + 1) - } - } - - eprintln!("Pages: {}", result.pages.len()); - eprintln!("Links: {}", result.links.len()); - eprintln!("Titles: {}", titles.titles.len()); - eprintln!("Title map entries: {}", titles.map.len()); - - Ok((result, titles)) -} - -/// Create map from normalized title to index in pages. 
-fn initialize_pages_map(pages: &[Page]) -> FxHashMap { - let mut result = FxHashMap::default(); - for (i, p) in pages.iter().enumerate() { - match result.entry(util::normalize_link(&p.data.title)) { - Entry::Occupied(entry) => { - eprintln!( - "{:?} already exists at index {} as {:?}", - p.data.title, - entry.get(), - util::normalize_link(&p.data.title) - ); + for (i, title) in titles.iter().enumerate() { + counter.tick(); + match title_lookup.entry(normalizer.normalize(title)) { + Entry::Occupied(mut entry) => { + let prev_i = *entry.get(); + let prev = &titles[prev_i as usize]; + if prev == title { + println!(" {title:?} ({prev_i}) occurs again at {i}"); + // Prefer later occurrences of articles over earlier ones under + // the assumption that their contents are "fresher". + entry.insert(i as u32); + } else { + println!( + " {prev:?} ({prev_i}) and {title:?} ({i}) both normalize to {:?}", + normalizer.normalize(title) + ); + } } Entry::Vacant(entry) => { entry.insert(i as u32); } } } - result + + counter.done(); + title_lookup } -fn second_stage( - first_stage: &AdjacencyList, - titles: &Titles, -) -> AdjacencyList { - let pages_map = initialize_pages_map(&first_stage.pages); - let mut result = AdjacencyList::default(); +fn read_page_data( + normalizer: &TitleNormalizer, + title_lookup: &HashMap, + r: &mut BufReader, +) -> io::Result<(Vec, Vec, Graph)> { + let mut counter = Counter::new(); + let mut pages = vec![]; + let mut links = vec![]; + let mut graph = Graph::new(); - for (page_idx, page) in first_stage.pages() { - result.push_page(page.data.clone()); + for (i, line) in r.lines().enumerate() { + counter.tick(); + let page = serde_json::from_str::(&line?).unwrap(); + let normalized = normalizer.normalize(&page.title); - for (_, link) in first_stage.links(page_idx) { - let title = util::normalize_link(titles.get(link.to)); - if let Some(to) = pages_map.get(&title) { - // The link points to an existing article, we should keep it - result.push_link(*to, 
link.data); - } + let expected_i = title_lookup[&normalized]; + if i as u32 != expected_i { + // Articles may occur multiple times, and this is not the instance + // of the article we should keep. + println!(" Skipping {:?} ({i}) in favor of {expected_i}", page.title); + continue; } - if (page_idx + 1) % 100_000 == 0 { - eprintln!("{} pages imported", page_idx + 1) + graph.add_node(); + pages.push(Page { + id: page.id, + title: page.title, + length: page.length, + redirect: page.redirect.is_some(), + }); + + let mut page_links = page.links; + if let Some(target) = page.redirect { + page_links.clear(); + let len = target.len() as u32; + page_links.push((target, 0, len, 0)); + } + + for (target, start, len, flags) in page_links { + if let Some(target_i) = title_lookup.get(&normalizer.normalize(&target)) { + graph.edges.push(NodeIdx(*target_i)); + links.push(Link { start, len, flags }); + } } } - eprintln!("Pages: {}", result.pages.len()); - eprintln!("Links: {}", result.links.len()); - eprintln!("Page map entries: {}", pages_map.len()); - - result + counter.done(); + Ok((pages, links, graph)) } -pub fn ingest(datafile: &Path) -> io::Result<()> { - eprintln!(">> First stage"); - let (first_stage, titles) = first_stage()?; - - eprintln!(">> Second stage"); - let data = second_stage(&first_stage, &titles); - - eprintln!(">> Consistency check"); - data.check_consistency(); - - eprintln!(">> Export"); - let mut datafile = BufWriter::new(File::create(datafile)?); - store::write_adjacency_list(&data, &mut datafile)?; - - Ok(()) +/// Convert sift data to brood data. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + /// The sift data file to ingest. 
+ data: PathBuf, +} + +impl Cmd { + pub fn run(self, data: &Path) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + println!(">> First pass"); + let mut sift_data = BufReader::new(File::open(&self.data)?); + + println!("> Reading titles"); + let titles = read_titles(&mut sift_data)?; + + println!("> Computing title index lookup table"); + let title_lookup = compute_title_lookup(&normalizer, &titles); + drop(titles); // Don't hoard memory + + println!(">> Second pass"); + sift_data.seek(io::SeekFrom::Start(0))?; + + println!("> Reading page data"); + let (pages, links, graph) = read_page_data(&normalizer, &title_lookup, &mut sift_data)?; + drop(title_lookup); // Don't hoard memory + drop(sift_data); // No longer needed + + println!("> Checking consistency"); + graph.check_consistency(); + + println!(">> Export"); + println!("Pages: {}", pages.len().separate_with_underscores()); + println!("Links: {}", links.len().separate_with_underscores()); + data::write_to_file(data, &pages, &links, &graph)?; + + Ok(()) + } } diff --git a/brood/src/data.rs b/brood/src/data.rs index 16aa0eb..69fc362 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -1,3 +1,177 @@ -pub mod adjacency_list; -pub mod info; -pub mod store; +use std::{ + fs::File, + io::{self, BufReader, BufWriter, Read, Write}, + path::Path, +}; + +use crate::graph::{EdgeIdx, Graph, NodeIdx}; + +#[derive(Debug, Clone)] +pub struct Page { + pub id: u32, + pub title: String, + pub length: u32, + pub redirect: bool, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct Link { + pub start: u32, + pub len: u32, + pub flags: u8, +} + +impl Link { + pub fn in_parens(self) -> bool { + self.flags & 0b1 != 0 + } + + pub fn in_structure(self) -> bool { + self.flags & 0b10 != 0 + } +} + +struct Store<'a, W>(&'a mut W); + +fn write_u8(w: &mut impl Write, n: u8) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u8(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 1]; + 
r.read_exact(&mut buf)?; + Ok(u8::from_le_bytes(buf)) +} + +fn write_u16(w: &mut impl Write, n: u16) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u16(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 2]; + r.read_exact(&mut buf)?; + Ok(u16::from_le_bytes(buf)) +} + +fn write_u32(w: &mut impl Write, n: u32) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u32(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 4]; + r.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) +} + +fn write_str(w: &mut impl Write, s: &str) -> io::Result<()> { + assert!(s.len() <= u16::MAX as usize); + write_u16(w, s.len() as u16)?; + w.write_all(s.as_bytes())?; + Ok(()) +} + +fn read_str(r: &mut impl Read) -> io::Result { + let len = read_u16(r)? as usize; + let mut buf = vec![0_u8; len]; + r.read_exact(&mut buf)?; + Ok(String::from_utf8(buf).unwrap()) +} + +fn write_page(w: &mut impl Write, page: &Page) -> io::Result<()> { + write_u32(w, page.id)?; + write_u32(w, page.length)?; + write_u8(w, if page.redirect { 1 } else { 0 })?; + write_str(w, &page.title)?; + Ok(()) +} + +pub fn read_page(r: &mut impl Read) -> io::Result { + Ok(Page { + id: read_u32(r)?, + length: read_u32(r)?, + redirect: read_u8(r)? 
!= 0, + title: read_str(r)?, + }) +} + +fn write_link(w: &mut impl Write, link: &Link) -> io::Result<()> { + write_u32(w, link.start)?; + write_u32(w, link.len)?; + write_u8(w, link.flags)?; + Ok(()) +} + +fn read_link(r: &mut impl Read) -> io::Result { + Ok(Link { + start: read_u32(r)?, + len: read_u32(r)?, + flags: read_u8(r)?, + }) +} + +fn write(w: &mut impl Write, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> { + assert!(pages.len() < u32::MAX as usize); + assert!(links.len() < u32::MAX as usize); + assert_eq!(pages.len(), graph.nodes.len()); + assert_eq!(links.len(), graph.edges.len()); + write_u32(w, pages.len() as u32)?; + write_u32(w, links.len() as u32)?; + + for page in pages { + write_page(w, page)?; + } + + for link in links { + write_link(w, link)?; + } + + for node in &graph.nodes { + write_u32(w, node.0)?; + } + + for edge in &graph.edges { + write_u32(w, edge.0)?; + } + + Ok(()) +} + +fn read(r: &mut impl Read) -> io::Result<(Vec, Vec, Graph)> { + let n_pages = read_u32(r)?; + let n_links = read_u32(r)?; + + let mut pages = Vec::with_capacity(n_pages as usize); + let mut links = Vec::with_capacity(n_links as usize); + let mut graph = Graph::with_capacity(n_pages as usize, n_links as usize); + + for _ in 0..n_pages { + pages.push(read_page(r)?); + } + + for _ in 0..n_links { + links.push(read_link(r)?); + } + + for _ in 0..n_pages { + graph.nodes.push(EdgeIdx(read_u32(r)?)); + } + + for _ in 0..n_links { + graph.edges.push(NodeIdx(read_u32(r)?)); + } + + assert_eq!(pages.len(), graph.nodes.len()); + assert_eq!(links.len(), graph.edges.len()); + graph.check_consistency(); + Ok((pages, links, graph)) +} + +pub fn write_to_file(path: &Path, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> { + let mut file = BufWriter::new(File::create(path)?); + write(&mut file, pages, links, graph) +} + +pub fn read_from_file(path: &Path) -> io::Result<(Vec, Vec, Graph)> { + let mut file = BufReader::new(File::open(path)?); + 
read(&mut file) +} diff --git a/brood/src/graph.rs b/brood/src/graph.rs index 9cd39d4..1cc25da 100644 --- a/brood/src/graph.rs +++ b/brood/src/graph.rs @@ -196,6 +196,10 @@ impl Graph { } } + pub fn add_node(&mut self) { + self.nodes.push(EdgeIdx::new(self.edges.len())); + } + pub fn check_consistency(&self) { if self.nodes.is_empty() { assert!(self.edges.is_empty(), "edges must belong to existing nodes"); diff --git a/brood/src/main.rs b/brood/src/main.rs index 3b93e2e..45bff55 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -1,62 +1,16 @@ mod algo; -pub mod commands; +mod commands; mod data; mod graph; mod util; -use std::fs::File; -use std::io::{self, BufReader}; -use std::path::{Path, PathBuf}; -use std::time::Instant; +use std::{io, path::PathBuf}; use clap::Parser; -use data::store; - -#[derive(Debug, PartialEq, Eq, Parser)] -pub enum PhilosophyGameCmd { - First, - Canonical, - Cluster, - Trace { start: String }, -} #[derive(Debug, Parser)] enum Command { - /// Read sift data on stdin and output brood data. - Ingest, - /// Read and reexport brood data. - Reexport { - to: PathBuf, - #[arg(long, short = 'P')] - in_parens: Option, - #[arg(long, short = 'S')] - in_structure: Option, - }, - /// Find a path from one article to another. - Path { - from: String, - to: String, - /// Flip start and end article. - #[arg(short, long)] - flip: bool, - }, - /// Find the longest shortest path starting at an article. - LongestShortestPath { - from: String, - }, - /// Analyze articles using "Philosophy Game" rules. - PhilosophyGame { - #[command(subcommand)] - subcmd: PhilosophyGameCmd, - }, - /// Print all page titles. - ListPages, - /// Print all links. - ListLinks { - /// The page to inspect. 
- page: String, - }, - Test, + Ingest(commands::ingest::Cmd), } #[derive(Debug, Parser)] @@ -69,42 +23,6 @@ struct Args { fn main() -> io::Result<()> { let args = Args::parse(); match args.command { - Command::Ingest => commands::ingest::ingest(&args.datafile), - Command::Reexport { - to, - in_parens, - in_structure, - } => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure), - Command::Path { from, to, flip } => { - if flip { - commands::path::path(&args.datafile, &to, &from) - } else { - commands::path::path(&args.datafile, &from, &to) - } - } - Command::LongestShortestPath { from } => { - commands::longest_shortest_path::run(&args.datafile, &from) - } - Command::PhilosophyGame { subcmd } => { - commands::philosophy_game::run(&args.datafile, subcmd) - } - Command::ListPages => commands::list_pages::run(&args.datafile), - Command::ListLinks { page } => commands::list_links::run(&args.datafile, &page), - Command::Test => test(&args.datafile), + Command::Ingest(cmd) => cmd.run(&args.datafile), } } - -fn test(datafile: &Path) -> io::Result<()> { - let a = Instant::now(); - // println!(">> Import adjacency list"); - // let mut databuf = BufReader::new(File::open(datafile)?); - // let adjlist = store::read_adjacency_list(&mut databuf)?; - println!(">> Import graph"); - let mut databuf = BufReader::new(File::open(datafile)?); - let (pages, links, graph) = store::read_graph(&mut databuf)?; - let b = Instant::now(); - - println!("{:?}", b.duration_since(a)); - - Ok(()) -} diff --git a/brood/src/util.rs b/brood/src/util.rs index e1a64ff..1cc1ab8 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,20 +1,151 @@ -use crate::data::{ - adjacency_list::{AdjacencyList, Page}, - info::{LinkInfo, PageInfo}, -}; +use std::{fmt, iter, time::Instant}; -pub fn normalize_link(link: &str) -> String { - let link = link.trim().replace(' ', "_"); +use regex::Regex; +use thousands::Separable; - // Make only first char lowercase - link.chars() - .next() - 
.iter() - .flat_map(|c| c.to_lowercase()) - .chain(link.chars().skip(1)) - .collect::() +pub struct Counter { + n: usize, + last_print: Instant, } +impl Counter { + pub fn new() -> Self { + Self { + n: 0, + last_print: Instant::now(), + } + } + + pub fn tick(&mut self) { + self.n += 1; + if self.n % 10_000 != 0 { + return; + } + + let now = Instant::now(); + if now.duration_since(self.last_print).as_secs() < 4 { + return; + } + + println!("{:>12}", self.n.separate_with_underscores()); + self.last_print = now; + } + + pub fn done(&self) { + println!("{:>12} (done)", self.n.separate_with_underscores()); + } +} + +// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/mediawiki.Title.phpCharToUpper.js +struct PhpCharToUpper(char); + +impl fmt::Display for PhpCharToUpper { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.0 { + // Do something special, I guess + 'ᾀ' => write!(f, "ᾈ"), + 'ᾁ' => write!(f, "ᾉ"), + 'ᾂ' => write!(f, "ᾊ"), + 'ᾃ' => write!(f, "ᾋ"), + 'ᾄ' => write!(f, "ᾌ"), + 'ᾅ' => write!(f, "ᾍ"), + 'ᾆ' => write!(f, "ᾎ"), + 'ᾇ' => write!(f, "ᾏ"), + 'ᾐ' => write!(f, "ᾘ"), + 'ᾑ' => write!(f, "ᾙ"), + 'ᾒ' => write!(f, "ᾚ"), + 'ᾓ' => write!(f, "ᾛ"), + 'ᾔ' => write!(f, "ᾜ"), + 'ᾕ' => write!(f, "ᾝ"), + 'ᾖ' => write!(f, "ᾞ"), + 'ᾗ' => write!(f, "ᾟ"), + 'ᾠ' => write!(f, "ᾨ"), + 'ᾡ' => write!(f, "ᾩ"), + 'ᾢ' => write!(f, "ᾪ"), + 'ᾣ' => write!(f, "ᾫ"), + 'ᾤ' => write!(f, "ᾬ"), + 'ᾥ' => write!(f, "ᾭ"), + 'ᾦ' => write!(f, "ᾮ"), + 'ᾧ' => write!(f, "ᾯ"), + 'ᾳ' => write!(f, "ᾼ"), + 'ῃ' => write!(f, "ῌ"), + 'ῳ' => write!(f, "ῼ"), + + // Do not capitalize + 'ß' | 'ʼn' | 'ǰ' | 'ʂ' | 'ͅ' | 'ΐ' | 'ΰ' | 'և' | 'ა' | 'ბ' | 'გ' | 'დ' | 'ე' | 'ვ' + | 'ზ' | 'თ' | 'ი' | 'კ' | 'ლ' | 'მ' | 'ნ' | 'ო' | 'პ' | 'ჟ' | 'რ' | 'ს' | 'ტ' | 'უ' + | 'ფ' | 'ქ' | 'ღ' | 'ყ' | 'შ' | 'ჩ' | 'ც' | 'ძ' | 'წ' | 'ჭ' | 'ხ' | 'ჯ' | 'ჰ' | 'ჱ' + | 'ჲ' | 'ჳ' | 'ჴ' | 'ჵ' | 'ჶ' | 'ჷ' | 'ჸ' | 'ჹ' | 'ჺ' | 'ჽ' | 'ჾ' | 'ჿ' | 'ᶎ' | 'ẖ' + | 'ẗ' | 'ẘ' | 'ẙ' 
| 'ẚ' | 'ὐ' | 'ὒ' | 'ὔ' | 'ὖ' | 'ᾈ' | 'ᾉ' | 'ᾊ' | 'ᾋ' | 'ᾌ' | 'ᾍ' + | 'ᾎ' | 'ᾏ' | 'ᾘ' | 'ᾙ' | 'ᾚ' | 'ᾛ' | 'ᾜ' | 'ᾝ' | 'ᾞ' | 'ᾟ' | 'ᾨ' | 'ᾩ' | 'ᾪ' | 'ᾫ' + | 'ᾬ' | 'ᾭ' | 'ᾮ' | 'ᾯ' | 'ᾲ' | 'ᾴ' | 'ᾶ' | 'ᾷ' | 'ᾼ' | 'ῂ' | 'ῄ' | 'ῆ' | 'ῇ' | 'ῌ' + | 'ῒ' | 'ΐ' | 'ῖ' | 'ῗ' | 'ῢ' | 'ΰ' | 'ῤ' | 'ῦ' | 'ῧ' | 'ῲ' | 'ῴ' | 'ῶ' | 'ῷ' | 'ῼ' + | 'ⅰ' | 'ⅱ' | 'ⅲ' | 'ⅳ' | 'ⅴ' | 'ⅵ' | 'ⅶ' | 'ⅷ' | 'ⅸ' | 'ⅹ' | 'ⅺ' | 'ⅻ' | 'ⅼ' | 'ⅽ' + | 'ⅾ' | 'ⅿ' | 'ⓐ' | 'ⓑ' | 'ⓒ' | 'ⓓ' | 'ⓔ' | 'ⓕ' | 'ⓖ' | 'ⓗ' | 'ⓘ' | 'ⓙ' | 'ⓚ' | 'ⓛ' + | 'ⓜ' | 'ⓝ' | 'ⓞ' | 'ⓟ' | 'ⓠ' | 'ⓡ' | 'ⓢ' | 'ⓣ' | 'ⓤ' | 'ⓥ' | 'ⓦ' | 'ⓧ' | 'ⓨ' | 'ⓩ' + | 'ꞔ' | 'ꞹ' | 'ꞻ' | 'ꞽ' | 'ꞿ' | 'ꟃ' | 'ff' | 'fi' | 'fl' | 'ffi' | 'ffl' | 'ſt' | 'st' | 'ﬓ' + | 'ﬔ' | 'ﬕ' | 'ﬖ' | 'ﬗ' | '𖹠' | '𖹡' | '𖹢' | '𖹣' | '𖹤' | '𖹥' | '𖹦' | '𖹧' | '𖹨' | '𖹩' + | '𖹪' | '𖹫' | '𖹬' | '𖹭' | '𖹮' | '𖹯' | '𖹰' | '𖹱' | '𖹲' | '𖹳' | '𖹴' | '𖹵' | '𖹶' | '𖹷' + | '𖹸' | '𖹹' | '𖹺' | '𖹻' | '𖹼' | '𖹽' | '𖹾' | '𖹿' => { + write!(f, "{}", self.0) + } + + // Capitalize normally + c => write!(f, "{}", c.to_uppercase()), + } + } +} + +pub struct TitleNormalizer { + strip_bidi: Regex, + clean_up_whitespace: Regex, + trim_underscore_start: Regex, + trim_underscore_end: Regex, +} + +impl TitleNormalizer { + pub fn new() -> Self { + Self { + strip_bidi: Regex::new("[\u{200E}\u{200F}\u{202A}-\u{202E}]").unwrap(), + + clean_up_whitespace: Regex::new(concat!( + "[ _\u{00A0}\u{1680}\u{180E}\u{2000}-\u{200A}", + "\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+" + )) + .unwrap(), + + trim_underscore_start: Regex::new("^_+").unwrap(), + + trim_underscore_end: Regex::new("_+$").unwrap(), + } + } + + /// Normalize an article title. + /// + /// See also . 
+ pub fn normalize(&self, title: &str) -> String { + // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L403 + + // Strip Unicode bidi override characters + let title = self.strip_bidi.replace_all(title, ""); + + // Clean up whitespace + let title = self.clean_up_whitespace.replace_all(&title, "_"); + + // Trim _ from beginning and end + let title = self.trim_underscore_start.replace_all(&title, ""); + let title = self.trim_underscore_end.replace_all(&title, ""); + + // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L206 + let Some(first) = title.chars().next() else { + return String::new(); + }; + let rest = &title[first.len_utf8()..]; + format!("{}{rest}", PhpCharToUpper(first)) + } +} + +/* pub fn find_index_of_title(pages: &[Page], title: &str) -> u32 { let title = normalize_link(title); pages @@ -37,3 +168,4 @@ pub fn resolve_redirects(data: &AdjacencyList, mut page_idx: return page_idx; } } +*/ From a3d0136ad265dbdc67c6dd0d0efb96b11de1afbe Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 01:34:25 +0100 Subject: [PATCH 11/36] Fix ingest logic and panics --- brood/src/commands/ingest.rs | 50 ++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 5407a8b..7df5d0f 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -37,31 +37,43 @@ fn read_titles(r: &mut BufReader) -> io::Result> { Ok(titles) } -fn compute_title_lookup(normalizer: &TitleNormalizer, titles: &[String]) -> HashMap { +/// Returns a map from normalized title to 1. the original index in the sift +/// data where the article should be taken from, and 2. the index in the brood +/// data where the article will appear. 
+fn compute_title_lookup( + normalizer: &TitleNormalizer, + titles: &[String], +) -> HashMap { let mut counter = Counter::new(); - let mut title_lookup = HashMap::new(); + let mut title_lookup = HashMap::::new(); - for (i, title) in titles.iter().enumerate() { + for (sift_i, title) in titles.iter().enumerate() { counter.tick(); + + // The index where this article will appear in the final list, assuming + // it is not a duplicate. For ownership reasons, we compute this here + // instead of inside the Entry::Vacant branch of the following match. + let brood_i = title_lookup.len(); + match title_lookup.entry(normalizer.normalize(title)) { + Entry::Vacant(entry) => { + entry.insert((sift_i as u32, brood_i as u32)); + } Entry::Occupied(mut entry) => { - let prev_i = *entry.get(); - let prev = &titles[prev_i as usize]; + let prev_sift_i = entry.get().0; + let prev = &titles[prev_sift_i as usize]; if prev == title { - println!(" {title:?} ({prev_i}) occurs again at {i}"); + println!(" {title:?} ({prev_sift_i}) occurs again at {sift_i}"); // Prefer later occurrences of articles over earlier ones under // the assumption that their contents are "fresher". 
- entry.insert(i as u32); + entry.get_mut().0 = sift_i as u32; } else { println!( - " {prev:?} ({prev_i}) and {title:?} ({i}) both normalize to {:?}", + " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) both normalize to {:?}", normalizer.normalize(title) ); } } - Entry::Vacant(entry) => { - entry.insert(i as u32); - } } } @@ -71,7 +83,7 @@ fn compute_title_lookup(normalizer: &TitleNormalizer, titles: &[String]) -> Hash fn read_page_data( normalizer: &TitleNormalizer, - title_lookup: &HashMap, + title_lookup: &HashMap, r: &mut BufReader, ) -> io::Result<(Vec, Vec, Graph)> { let mut counter = Counter::new(); @@ -84,11 +96,11 @@ fn read_page_data( let page = serde_json::from_str::(&line?).unwrap(); let normalized = normalizer.normalize(&page.title); - let expected_i = title_lookup[&normalized]; - if i as u32 != expected_i { + let (sift_i, _) = title_lookup[&normalized]; + if i as u32 != sift_i { // Articles may occur multiple times, and this is not the instance // of the article we should keep. 
- println!(" Skipping {:?} ({i}) in favor of {expected_i}", page.title); + println!(" Skipping {:?} ({i}) in favor of {sift_i}", page.title); continue; } @@ -108,8 +120,8 @@ fn read_page_data( } for (target, start, len, flags) in page_links { - if let Some(target_i) = title_lookup.get(&normalizer.normalize(&target)) { - graph.edges.push(NodeIdx(*target_i)); + if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) { + graph.edges.push(NodeIdx(*brood_i)); links.push(Link { start, len, flags }); } } @@ -152,8 +164,8 @@ impl Cmd { graph.check_consistency(); println!(">> Export"); - println!("Pages: {}", pages.len().separate_with_underscores()); - println!("Links: {}", links.len().separate_with_underscores()); + println!("Pages: {:>13}", pages.len().separate_with_underscores()); + println!("Links: {:>13}", links.len().separate_with_underscores()); data::write_to_file(data, &pages, &links, &graph)?; Ok(()) From eb631250d70fae4b87d94594d8b848500ba95841 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 02:18:39 +0100 Subject: [PATCH 12/36] Fix ingest logic yet again --- brood/src/commands/ingest.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 7df5d0f..7e0f223 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -37,9 +37,14 @@ fn read_titles(r: &mut BufReader) -> io::Result> { Ok(titles) } -/// Returns a map from normalized title to 1. the original index in the sift -/// data where the article should be taken from, and 2. the index in the brood -/// data where the article will appear. +/// Returns a map from normalized title to the index in the brood data where the +/// article will appear. +/// +/// Titles in the title list are not always unique. When multiple identical +/// titles appear, all but one have to be discarded. 
Originally, I tried to be +/// smart and keep the last occurrence (under the assumption that its data would +/// be the newest), but this led to index-based bugs. Because of this, I now +/// keep the first occurrence. fn compute_title_lookup( normalizer: &TitleNormalizer, titles: &[String], @@ -59,14 +64,11 @@ fn compute_title_lookup( Entry::Vacant(entry) => { entry.insert((sift_i as u32, brood_i as u32)); } - Entry::Occupied(mut entry) => { + Entry::Occupied(entry) => { let prev_sift_i = entry.get().0; let prev = &titles[prev_sift_i as usize]; if prev == title { println!(" {title:?} ({prev_sift_i}) occurs again at {sift_i}"); - // Prefer later occurrences of articles over earlier ones under - // the assumption that their contents are "fresher". - entry.get_mut().0 = sift_i as u32; } else { println!( " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) both normalize to {:?}", @@ -157,6 +159,7 @@ impl Cmd { println!("> Reading page data"); let (pages, links, graph) = read_page_data(&normalizer, &title_lookup, &mut sift_data)?; + assert_eq!(pages.len(), title_lookup.len()); drop(title_lookup); // Don't hoard memory drop(sift_data); // No longer needed From e04215802e2ffeeb12a6c85d304b66fa5ccc569a Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 02:37:15 +0100 Subject: [PATCH 13/36] Speed up ingest using rustc_hash An enwiki ingest went from ca. 6:50 minutes down to ca. 7:00 minutes. Oh wait... This was not a rigorous test, but rustc_hash doesn't seem to have a significant positive impact. Maybe I'm just holding it wrong, but right now I'd rather remove it again and have simpler code/deps. 
--- brood/src/commands/ingest.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 7e0f223..fd26d39 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -1,10 +1,11 @@ use std::{ - collections::{hash_map::Entry, HashMap}, + collections::hash_map::Entry, fs::File, io::{self, BufRead, BufReader, Seek}, path::{Path, PathBuf}, }; +use rustc_hash::FxHashMap; use serde::Deserialize; use thousands::Separable; @@ -48,9 +49,9 @@ fn read_titles(r: &mut BufReader) -> io::Result> { fn compute_title_lookup( normalizer: &TitleNormalizer, titles: &[String], -) -> HashMap { +) -> FxHashMap { let mut counter = Counter::new(); - let mut title_lookup = HashMap::::new(); + let mut title_lookup = FxHashMap::::default(); for (sift_i, title) in titles.iter().enumerate() { counter.tick(); @@ -85,7 +86,7 @@ fn compute_title_lookup( fn read_page_data( normalizer: &TitleNormalizer, - title_lookup: &HashMap, + title_lookup: &FxHashMap, r: &mut BufReader, ) -> io::Result<(Vec, Vec, Graph)> { let mut counter = Counter::new(); From abd6b3519c249198353a49d3f782bae1e9675934 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 02:48:07 +0100 Subject: [PATCH 14/36] Get rid of rustc_hash --- brood/Cargo.lock | 7 ------- brood/Cargo.toml | 1 - brood/src/commands/ingest.rs | 9 ++++----- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 16c8cc8..414bb49 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -66,7 +66,6 @@ version = "0.0.0" dependencies = [ "clap", "regex", - "rustc-hash", "serde", "serde_json", "thousands", @@ -189,12 +188,6 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" -[[package]] -name = "rustc-hash" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" - [[package]] name = "ryu" version = "1.0.18" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index a560f73..0dd4156 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -6,7 +6,6 @@ edition = "2021" [dependencies] clap = { version = "4.5.23", features = ["derive", "deprecated"] } regex = "1.11.1" -rustc-hash = "2.1.0" serde = { version = "1.0.217", features = ["derive"] } serde_json = "1.0.134" thousands = "0.2.0" diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index fd26d39..7e0f223 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -1,11 +1,10 @@ use std::{ - collections::hash_map::Entry, + collections::{hash_map::Entry, HashMap}, fs::File, io::{self, BufRead, BufReader, Seek}, path::{Path, PathBuf}, }; -use rustc_hash::FxHashMap; use serde::Deserialize; use thousands::Separable; @@ -49,9 +48,9 @@ fn read_titles(r: &mut BufReader) -> io::Result> { fn compute_title_lookup( normalizer: &TitleNormalizer, titles: &[String], -) -> FxHashMap { +) -> HashMap { let mut counter = Counter::new(); - let mut title_lookup = FxHashMap::::default(); + let mut title_lookup = HashMap::::new(); for (sift_i, title) in titles.iter().enumerate() { counter.tick(); @@ -86,7 +85,7 @@ fn compute_title_lookup( fn read_page_data( normalizer: &TitleNormalizer, - title_lookup: &FxHashMap, + title_lookup: &HashMap, r: &mut BufReader, ) -> io::Result<(Vec, Vec, Graph)> { let mut counter = Counter::new(); From 4e41084f2a44477a5845feb5a90db7c17c63a027 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 02:48:28 +0100 Subject: [PATCH 15/36] Port path command --- brood/src/commands.rs | 1 + brood/src/commands/path.rs | 114 ++++++++++++++++--------------------- brood/src/main.rs | 2 + brood/src/util.rs | 38 ++++++++----- 4 files changed, 77 insertions(+), 78 deletions(-) diff --git a/brood/src/commands.rs b/brood/src/commands.rs index b3ac910..2e77470 
100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1 +1,2 @@ pub mod ingest; +pub mod path; diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index 55c72ed..7a5dcb9 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -1,77 +1,61 @@ -use std::{ - fs::File, - io::{self, BufReader}, - path::Path, -}; +use std::{io, path::Path}; use crate::{ algo::Dijkstra, - data::{info::PageInfo, store}, - graph::{Graph, NodeIdx}, - util, + data, + util::{self, TitleNormalizer}, }; -pub fn find_index_of_title(pages: &[PageInfo], title: &str) -> NodeIdx { - let title = util::normalize_link(title); - pages - .iter() - .enumerate() - .find(|(_, p)| util::normalize_link(&p.title) == title) - .map(|(i, _)| NodeIdx::new(i)) - .expect("invalid title") +/// Find the shortest path between two articles. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + start: String, + goal: String, } -pub fn resolve_redirects(pages: &[PageInfo], graph: &Graph, mut page: NodeIdx) -> NodeIdx { - loop { - if pages[page.usize()].redirect { - if let Some(next) = graph.edges_for(page).first() { - page = *next; - continue; +impl Cmd { + pub fn run(self, data: &Path) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + println!(">> Import"); + let (pages, _links, graph) = data::read_from_file(data)?; + + println!(">> Resolve articles"); + let start = util::resolve_title(&normalizer, &pages, &graph, &self.start); + let goal = util::resolve_title(&normalizer, &pages, &graph, &self.goal); + println!("Start: {}", pages[start.usize()].title); + println!("Goal: {}", pages[goal.usize()].title); + + println!(">> Find path"); + println!("> Preparing dijkstra"); + let mut dijkstra = Dijkstra::new(&graph); + println!("> Running dijkstra"); + dijkstra.run( + start, + |node| node == goal, + |source, _edge, _target| !pages[source.usize()].redirect as u32, + ); + + if dijkstra.cost(goal) == u32::MAX { + println!("No path found"); + return Ok(()); + } + + 
println!("> Collecting path"); + let path = dijkstra.path(goal); + let cost = dijkstra.cost(goal); + + println!(); + println!("Path found (cost {cost}, length {}):", path.len()); + for page in path { + let info = &pages[page.usize()]; + if info.redirect { + println!("v {:?}", info.title); + } else { + println!("- {:?}", info.title); } } - return page; + Ok(()) } } - -pub fn path(datafile: &Path, start: &str, goal: &str) -> io::Result<()> { - println!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let (pages, _links, graph) = store::read_graph(&mut databuf)?; - - println!(">> Locate from and to"); - let start = resolve_redirects(&pages, &graph, find_index_of_title(&pages, start)); - let goal = resolve_redirects(&pages, &graph, find_index_of_title(&pages, goal)); - println!("Start: {:?}", pages[start.usize()].title); - println!("Goal: {:?}", pages[goal.usize()].title); - - println!(">> Find path"); - println!("> Preparing dijkstra"); - let mut dijkstra = Dijkstra::new(&graph); - println!("> Running dijkstra"); - dijkstra.run( - start, - |node| node == goal, - |source, _edge, _target| !pages[source.usize()].redirect as u32, - ); - - if dijkstra.cost(goal) == u32::MAX { - println!("No path found"); - return Ok(()); - } - - println!("> Collecting path"); - let path = dijkstra.path(goal); - let cost = dijkstra.cost(goal); - println!("Path found (cost {cost}, length {}):", path.len()); - for page in path { - let info = &pages[page.usize()]; - if info.redirect { - println!(" v {:?}", info.title); - } else { - println!(" - {:?}", info.title); - } - } - - Ok(()) -} diff --git a/brood/src/main.rs b/brood/src/main.rs index 45bff55..c31b1f4 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -11,6 +11,7 @@ use clap::Parser; #[derive(Debug, Parser)] enum Command { Ingest(commands::ingest::Cmd), + Path(commands::path::Cmd), } #[derive(Debug, Parser)] @@ -24,5 +25,6 @@ fn main() -> io::Result<()> { let args = Args::parse(); match args.command { 
Command::Ingest(cmd) => cmd.run(&args.datafile), + Command::Path(cmd) => cmd.run(&args.datafile), } } diff --git a/brood/src/util.rs b/brood/src/util.rs index 1cc1ab8..f594058 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,8 +1,13 @@ -use std::{fmt, iter, time::Instant}; +use std::{fmt, iter, thread::panicking, time::Instant}; use regex::Regex; use thousands::Separable; +use crate::{ + data::Page, + graph::{Graph, NodeIdx}, +}; + pub struct Counter { n: usize, last_print: Instant, @@ -145,27 +150,34 @@ impl TitleNormalizer { } } -/* -pub fn find_index_of_title(pages: &[Page], title: &str) -> u32 { - let title = normalize_link(title); +pub fn locate_title(normalizer: &TitleNormalizer, pages: &[Page], title: &str) -> NodeIdx { + let normalized = normalizer.normalize(title); pages .iter() .enumerate() - .find(|(_, p)| normalize_link(&p.data.title) == title) - .map(|(i, _)| i) - .expect("invalid title") as u32 + .find(|(_, p)| normalizer.normalize(&p.title) == normalized) + .map(|(i, _)| NodeIdx::new(i)) + .expect("invalid title") } -pub fn resolve_redirects(data: &AdjacencyList, mut page_idx: u32) -> u32 { +pub fn resolve_redirects(pages: &[Page], graph: &Graph, mut page: NodeIdx) -> NodeIdx { loop { - if data.page(page_idx).data.redirect { - if let Some(link_idx) = data.link_redirect(page_idx) { - page_idx = data.link(link_idx).to; + if pages[page.usize()].redirect { + if let Some(target) = graph.edges_for(page).first() { + page = *target; continue; } } - return page_idx; + return page; } } -*/ + +pub fn resolve_title( + normalizer: &TitleNormalizer, + pages: &[Page], + graph: &Graph, + title: &str, +) -> NodeIdx { + resolve_redirects(pages, graph, locate_title(normalizer, pages, title)) +} From 6ca20c97406e3fccb93e2bba35d69f5d1b4b4fa5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 11:20:09 +0100 Subject: [PATCH 16/36] Remove some old code --- brood/src/data/adjacency_list.rs | 196 ------------------------------- brood/src/data/info.rs | 
24 ---- brood/src/data/store.rs | 160 ------------------------- 3 files changed, 380 deletions(-) delete mode 100644 brood/src/data/adjacency_list.rs delete mode 100644 brood/src/data/info.rs delete mode 100644 brood/src/data/store.rs diff --git a/brood/src/data/adjacency_list.rs b/brood/src/data/adjacency_list.rs deleted file mode 100644 index 2de1631..0000000 --- a/brood/src/data/adjacency_list.rs +++ /dev/null @@ -1,196 +0,0 @@ -use std::ops::Range; - -use super::info::{LinkInfo, PageInfo}; - -#[derive(Debug, Clone, Copy)] -pub struct Page
<P>
{ - /// Index of the first link belonging to this page. - pub start: u32, - pub data: P, -} - -impl
<P>
Page
<P>
{ - pub fn change_data(self, f: &impl Fn(P) -> P2) -> Page { - Page { - start: self.start, - data: f(self.data), - } - } -} - -#[derive(Debug, Clone, Copy)] -pub struct Link { - /// Index of the page this link points to. - pub to: u32, - pub data: L, -} - -impl Link { - pub fn change_data(self, f: &impl Fn(L) -> L2) -> Link { - Link { - to: self.to, - data: f(self.data), - } - } - - pub fn change_data_with_page(self, page: &P, f: &impl Fn(&P, L) -> L2) -> Link { - Link { - to: self.to, - data: f(page, self.data), - } - } -} - -pub struct AdjacencyList { - pub pages: Vec>, - pub links: Vec>, -} - -impl Default for AdjacencyList { - fn default() -> Self { - Self { - pages: vec![], - links: vec![], - } - } -} - -impl AdjacencyList { - pub fn push_page(&mut self, data: P) { - self.pages.push(Page { - start: self.links.len() as u32, - data, - }); - } - - pub fn push_link(&mut self, to: u32, data: L) { - self.links.push(Link { to, data }) - } - - pub fn page(&self, page_idx: u32) -> &Page
<P>
{ - &self.pages[page_idx as usize] - } - - pub fn page_mut(&mut self, page_idx: u32) -> &mut Page
<P>
{ - &mut self.pages[page_idx as usize] - } - - pub fn pages(&self) -> impl Iterator)> { - self.pages.iter().enumerate().map(|(i, p)| (i as u32, p)) - } - - pub fn link(&self, link_idx: u32) -> &Link { - &self.links[link_idx as usize] - } - - pub fn link_mut(&mut self, link_idx: u32) -> &mut Link { - &mut self.links[link_idx as usize] - } - - pub fn link_range(&self, page_idx: u32) -> Range { - let start_idx = self.pages[page_idx as usize].start; - let end_idx = match self.pages.get(page_idx as usize + 1) { - Some(page) => page.start, - None => self.links.len() as u32, - }; - start_idx..end_idx - } - - pub fn link_redirect(&self, page_idx: u32) -> Option { - let range = self.link_range(page_idx); - if range.is_empty() { - None - } else { - Some(range.start) - } - } - - pub fn links(&self, page_idx: u32) -> impl Iterator)> { - self.link_range(page_idx).map(|i| (i, self.link(i))) - } - - pub fn change_page_data(self, page_f: impl Fn(P) -> P2) -> AdjacencyList { - let pages = self - .pages - .into_iter() - .map(|p| p.change_data(&page_f)) - .collect::>(); - - AdjacencyList { - pages, - links: self.links, - } - } - - pub fn change_link_data(self, link_f: impl Fn(L) -> L2) -> AdjacencyList { - let links = self - .links - .into_iter() - .map(|l| l.change_data(&link_f)) - .collect::>(); - - AdjacencyList { - pages: self.pages, - links, - } - } - - pub fn change_link_data_with_page( - self, - link_f: impl Fn(&P, L) -> L2, - ) -> AdjacencyList { - let mut pages = self.pages.iter().peekable(); - let Some(mut cur_page) = pages.next() else { - // The list is empty, nothing to do - return AdjacencyList::default(); - }; - - let mut links = vec![]; - - for (i, link) in self.links.into_iter().enumerate() { - if let Some(page) = pages.peek() { - if i >= page.start as usize { - cur_page = page; - pages.next(); - } - } - - links.push(link.change_data_with_page(&cur_page.data, &link_f)); - } - - AdjacencyList { - pages: self.pages, - links, - } - } -} - -impl AdjacencyList { - pub fn 
check_consistency(&self) { - // Check that all types are large enough - assert!(self.pages.len() < u32::MAX as usize, "too many pages"); - assert!(self.links.len() < u32::MAX as usize, "too many links"); - for page in &self.pages { - assert!( - page.data.title.len() <= u8::MAX as usize, - "page title too long" - ); - } - - // Check that all links contain valid indices. Links must not link to - // the sentinel page. - let range = 0..self.pages.len() as u32; - for link in &self.links { - assert!(range.contains(&link.to), "invalid link"); - } - - // Check that all redirect pages have at most one link - for (page_idx, page) in self.pages.iter().enumerate() { - if page.data.redirect { - let range = self.link_range(page_idx as u32); - let amount = range.end - range.start; - assert!(amount <= 1, "too many redirect links"); - } - } - } -} diff --git a/brood/src/data/info.rs b/brood/src/data/info.rs deleted file mode 100644 index dad04d4..0000000 --- a/brood/src/data/info.rs +++ /dev/null @@ -1,24 +0,0 @@ -#[derive(Debug, Clone)] -pub struct PageInfo { - pub id: u32, - pub title: String, - pub length: u32, - pub redirect: bool, -} - -#[derive(Debug, Default, Clone, Copy)] -pub struct LinkInfo { - pub start: u32, - pub len: u32, - pub flags: u8, -} - -impl LinkInfo { - pub fn in_parens(self) -> bool { - self.flags & 0b1 != 0 - } - - pub fn in_structure(self) -> bool { - self.flags & 0b10 != 0 - } -} diff --git a/brood/src/data/store.rs b/brood/src/data/store.rs deleted file mode 100644 index 06a35eb..0000000 --- a/brood/src/data/store.rs +++ /dev/null @@ -1,160 +0,0 @@ -use std::io::{self, Read, Write}; - -use crate::graph::{EdgeIdx, Graph, NodeIdx}; - -use super::{ - adjacency_list::{AdjacencyList, Link, Page}, - info::{LinkInfo, PageInfo}, -}; - -fn write_u8(n: u8, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) -} - -fn read_u8(from: &mut R) -> io::Result { - let mut buf = [0_u8; 1]; - from.read_exact(&mut buf)?; - Ok(u8::from_le_bytes(buf)) -} - -fn 
write_u16(n: u16, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) -} - -fn read_u16(from: &mut R) -> io::Result { - let mut buf = [0_u8; 2]; - from.read_exact(&mut buf)?; - Ok(u16::from_le_bytes(buf)) -} - -fn write_u32(n: u32, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) -} - -fn read_u32(from: &mut R) -> io::Result { - let mut buf = [0_u8; 4]; - from.read_exact(&mut buf)?; - Ok(u32::from_le_bytes(buf)) -} - -fn write_str(s: &str, to: &mut W) -> io::Result<()> { - assert!(s.len() <= u16::MAX as usize); - write_u16(s.len() as u16, to)?; - to.write_all(s.as_bytes())?; - Ok(()) -} - -fn read_str(from: &mut R) -> io::Result { - let len = read_u16(from)? as usize; - let mut buf = vec![0_u8; len]; - from.read_exact(&mut buf)?; - Ok(String::from_utf8(buf).unwrap()) -} - -fn write_page(page: &Page, to: &mut W) -> io::Result<()> { - write_u32(page.start, to)?; - write_u32(page.data.id, to)?; - write_u32(page.data.length, to)?; - write_u8(if page.data.redirect { 1 } else { 0 }, to)?; - write_str(&page.data.title, to)?; - - Ok(()) -} - -pub fn read_page(from: &mut R) -> io::Result> { - let start_link_idx = read_u32(from)?; - let id = read_u32(from)?; - let length = read_u32(from)?; - let redirect = read_u8(from)? 
!= 0; - let title = read_str(from)?; - - Ok(Page { - start: start_link_idx, - data: PageInfo { - id, - length, - redirect, - title, - }, - }) -} - -fn write_link(link: &Link, to: &mut W) -> io::Result<()> { - write_u32(link.to, to)?; - write_u32(link.data.start, to)?; - write_u32(link.data.len, to)?; - write_u8(link.data.flags, to)?; - - Ok(()) -} - -fn read_link(from: &mut R) -> io::Result> { - let to_page_idx = read_u32(from)?; - let start = read_u32(from)?; - let len = read_u32(from)?; - let flags = read_u8(from)?; - - Ok(Link { - to: to_page_idx, - data: LinkInfo { start, len, flags }, - }) -} - -pub fn write_adjacency_list( - al: &AdjacencyList, - to: &mut W, -) -> io::Result<()> { - write_u32(al.pages.len() as u32, to)?; - write_u32(al.links.len() as u32, to)?; - - for page in &al.pages { - write_page(page, to)?; - } - - for link in &al.links { - write_link(link, to)?; - } - - Ok(()) -} - -pub fn read_adjacency_list(from: &mut R) -> io::Result> { - let n_pages = read_u32(from)?; - let n_links = read_u32(from)?; - - let mut pages = vec![]; - for _ in 0..n_pages { - pages.push(read_page(from)?); - } - - let mut links = vec![]; - for _ in 0..n_links { - links.push(read_link(from)?); - } - - Ok(AdjacencyList { pages, links }) -} - -pub fn read_graph(from: &mut impl Read) -> io::Result<(Vec, Vec, Graph)> { - let n_pages = read_u32(from)?; - let n_links = read_u32(from)?; - - let mut pages = Vec::with_capacity(n_pages as usize); - let mut links = Vec::with_capacity(n_links as usize); - let mut graph = Graph::with_capacity(n_pages as usize, n_links as usize); - - for _ in 0..n_pages { - let page = read_page(from)?; - graph.nodes.push(EdgeIdx(page.start)); - pages.push(page.data); - } - - for _ in 0..n_links { - let link = read_link(from)?; - graph.edges.push(NodeIdx(link.to)); - links.push(link.data); - } - - graph.check_consistency(); - Ok((pages, links, graph)) -} From 01683735094b398bb279145eff11a28f80385e11 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 
2024 11:26:32 +0100 Subject: [PATCH 17/36] Move dijkstra to new file --- brood/src/algo.rs | 78 +------------------------------------- brood/src/algo/dijkstra.rs | 77 +++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 76 deletions(-) create mode 100644 brood/src/algo/dijkstra.rs diff --git a/brood/src/algo.rs b/brood/src/algo.rs index b6bf26a..ffc1aa5 100644 --- a/brood/src/algo.rs +++ b/brood/src/algo.rs @@ -1,77 +1,3 @@ -use std::{cmp::Reverse, collections::BinaryHeap}; +mod dijkstra; -use crate::graph::{EdgeIdx, Graph, NodeIdx}; - -pub struct Dijkstra<'a> { - graph: &'a Graph, - cost: Vec, - pred: Vec, -} - -impl<'a> Dijkstra<'a> { - pub fn new(graph: &'a Graph) -> Self { - Self { - graph, - cost: vec![u32::MAX; graph.nodes.len()], - pred: vec![NodeIdx::NONE; graph.nodes.len()], - } - } - - pub fn run( - &mut self, - start: NodeIdx, - goal: impl Fn(NodeIdx) -> bool, - cost: impl Fn(NodeIdx, EdgeIdx, NodeIdx) -> u32, - ) { - self.cost[start.usize()] = 0; - let mut queue = BinaryHeap::new(); - queue.push((Reverse(0), start)); - - while let Some((Reverse(curr_cost), curr)) = queue.pop() { - if goal(curr) { - break; // We've found the shortest path to our target - } - - // These seem to never actually occur - // if curr_cost > self.cost[curr.usize()] { - // continue; // Outdated entry - // } - - for edge in self.graph.edge_range(curr).map(EdgeIdx::new) { - let next = self.graph.edges[edge.usize()]; - let next_cost = curr_cost + cost(curr, edge, next); - if next_cost < self.cost[next.usize()] { - self.cost[next.usize()] = next_cost; - self.pred[next.usize()] = curr; - queue.push((Reverse(next_cost), next)); - } - } - } - } - - #[inline] - pub fn cost(&self, node: NodeIdx) -> u32 { - self.cost[node.usize()] - } - - #[inline] - pub fn pred(&self, node: NodeIdx) -> NodeIdx { - self.pred[node.usize()] - } - - pub fn path(&self, goal: NodeIdx) -> Vec { - let mut path = vec![]; - let mut at = goal; - - loop { - path.push(at); - at = self.pred(at); - 
if at == NodeIdx::NONE { - break; - } - } - - path.reverse(); - path - } -} +pub use self::dijkstra::*; diff --git a/brood/src/algo/dijkstra.rs b/brood/src/algo/dijkstra.rs new file mode 100644 index 0000000..b6bf26a --- /dev/null +++ b/brood/src/algo/dijkstra.rs @@ -0,0 +1,77 @@ +use std::{cmp::Reverse, collections::BinaryHeap}; + +use crate::graph::{EdgeIdx, Graph, NodeIdx}; + +pub struct Dijkstra<'a> { + graph: &'a Graph, + cost: Vec, + pred: Vec, +} + +impl<'a> Dijkstra<'a> { + pub fn new(graph: &'a Graph) -> Self { + Self { + graph, + cost: vec![u32::MAX; graph.nodes.len()], + pred: vec![NodeIdx::NONE; graph.nodes.len()], + } + } + + pub fn run( + &mut self, + start: NodeIdx, + goal: impl Fn(NodeIdx) -> bool, + cost: impl Fn(NodeIdx, EdgeIdx, NodeIdx) -> u32, + ) { + self.cost[start.usize()] = 0; + let mut queue = BinaryHeap::new(); + queue.push((Reverse(0), start)); + + while let Some((Reverse(curr_cost), curr)) = queue.pop() { + if goal(curr) { + break; // We've found the shortest path to our target + } + + // These seem to never actually occur + // if curr_cost > self.cost[curr.usize()] { + // continue; // Outdated entry + // } + + for edge in self.graph.edge_range(curr).map(EdgeIdx::new) { + let next = self.graph.edges[edge.usize()]; + let next_cost = curr_cost + cost(curr, edge, next); + if next_cost < self.cost[next.usize()] { + self.cost[next.usize()] = next_cost; + self.pred[next.usize()] = curr; + queue.push((Reverse(next_cost), next)); + } + } + } + } + + #[inline] + pub fn cost(&self, node: NodeIdx) -> u32 { + self.cost[node.usize()] + } + + #[inline] + pub fn pred(&self, node: NodeIdx) -> NodeIdx { + self.pred[node.usize()] + } + + pub fn path(&self, goal: NodeIdx) -> Vec { + let mut path = vec![]; + let mut at = goal; + + loop { + path.push(at); + at = self.pred(at); + if at == NodeIdx::NONE { + break; + } + } + + path.reverse(); + path + } +} From aa4187fcd890839e1c17d7858b0bef835394e056 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 
2024 12:16:34 +0100 Subject: [PATCH 18/36] Group pages, links, and graph in Data struct --- brood/src/commands/ingest.rs | 40 ++++++----- brood/src/commands/path.rs | 18 ++--- brood/src/data.rs | 129 ++++++++++++++++++++--------------- brood/src/graph.rs | 17 +++-- brood/src/util.rs | 21 +++--- 5 files changed, 124 insertions(+), 101 deletions(-) diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 7e0f223..42fc13a 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -9,8 +9,8 @@ use serde::Deserialize; use thousands::Separable; use crate::{ - data::{self, Link, Page}, - graph::{Graph, NodeIdx}, + data::{Data, Link, Page}, + graph::NodeIdx, util::{Counter, TitleNormalizer}, }; @@ -87,11 +87,9 @@ fn read_page_data( normalizer: &TitleNormalizer, title_lookup: &HashMap, r: &mut BufReader, -) -> io::Result<(Vec, Vec, Graph)> { +) -> io::Result { let mut counter = Counter::new(); - let mut pages = vec![]; - let mut links = vec![]; - let mut graph = Graph::new(); + let mut data = Data::new(); for (i, line) in r.lines().enumerate() { counter.tick(); @@ -106,8 +104,8 @@ fn read_page_data( continue; } - graph.add_node(); - pages.push(Page { + data.graph.add_node(); + data.pages.push(Page { id: page.id, title: page.title, length: page.length, @@ -123,14 +121,14 @@ fn read_page_data( for (target, start, len, flags) in page_links { if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) { - graph.edges.push(NodeIdx(*brood_i)); - links.push(Link { start, len, flags }); + data.graph.edges.push(NodeIdx(*brood_i)); + data.links.push(Link { start, len, flags }); } } } counter.done(); - Ok((pages, links, graph)) + Ok(data) } /// Convert sift data to brood data. 
@@ -141,7 +139,7 @@ pub struct Cmd { } impl Cmd { - pub fn run(self, data: &Path) -> io::Result<()> { + pub fn run(self, brood_data: &Path) -> io::Result<()> { let normalizer = TitleNormalizer::new(); println!(">> First pass"); @@ -158,18 +156,24 @@ impl Cmd { sift_data.seek(io::SeekFrom::Start(0))?; println!("> Reading page data"); - let (pages, links, graph) = read_page_data(&normalizer, &title_lookup, &mut sift_data)?; - assert_eq!(pages.len(), title_lookup.len()); + let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?; + assert_eq!(data.pages.len(), title_lookup.len()); drop(title_lookup); // Don't hoard memory drop(sift_data); // No longer needed println!("> Checking consistency"); - graph.check_consistency(); + data.graph.check_consistency(); println!(">> Export"); - println!("Pages: {:>13}", pages.len().separate_with_underscores()); - println!("Links: {:>13}", links.len().separate_with_underscores()); - data::write_to_file(data, &pages, &links, &graph)?; + println!( + "Pages: {:>13}", + data.pages.len().separate_with_underscores() + ); + println!( + "Links: {:>13}", + data.links.len().separate_with_underscores() + ); + data.write_to_file(brood_data)?; Ok(()) } diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index 7a5dcb9..882203c 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -2,7 +2,7 @@ use std::{io, path::Path}; use crate::{ algo::Dijkstra, - data, + data::Data, util::{self, TitleNormalizer}, }; @@ -18,22 +18,22 @@ impl Cmd { let normalizer = TitleNormalizer::new(); println!(">> Import"); - let (pages, _links, graph) = data::read_from_file(data)?; + let data = Data::read_from_file(data)?; println!(">> Resolve articles"); - let start = util::resolve_title(&normalizer, &pages, &graph, &self.start); - let goal = util::resolve_title(&normalizer, &pages, &graph, &self.goal); - println!("Start: {}", pages[start.usize()].title); - println!("Goal: {}", pages[goal.usize()].title); + let start = 
util::resolve_title(&normalizer, &data, &self.start); + let goal = util::resolve_title(&normalizer, &data, &self.goal); + println!("Start: {}", data.pages[start.usize()].title); + println!("Goal: {}", data.pages[goal.usize()].title); println!(">> Find path"); println!("> Preparing dijkstra"); - let mut dijkstra = Dijkstra::new(&graph); + let mut dijkstra = Dijkstra::new(&data.graph); println!("> Running dijkstra"); dijkstra.run( start, |node| node == goal, - |source, _edge, _target| !pages[source.usize()].redirect as u32, + |source, _edge, _target| !data.pages[source.usize()].redirect as u32, ); if dijkstra.cost(goal) == u32::MAX { @@ -48,7 +48,7 @@ impl Cmd { println!(); println!("Path found (cost {cost}, length {}):", path.len()); for page in path { - let info = &pages[page.usize()]; + let info = &data.pages[page.usize()]; if info.redirect { println!("v {:?}", info.title); } else { diff --git a/brood/src/data.rs b/brood/src/data.rs index 69fc362..20c95a6 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -109,69 +109,88 @@ fn read_link(r: &mut impl Read) -> io::Result { }) } -fn write(w: &mut impl Write, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> { - assert!(pages.len() < u32::MAX as usize); - assert!(links.len() < u32::MAX as usize); - assert_eq!(pages.len(), graph.nodes.len()); - assert_eq!(links.len(), graph.edges.len()); - write_u32(w, pages.len() as u32)?; - write_u32(w, links.len() as u32)?; - - for page in pages { - write_page(w, page)?; - } - - for link in links { - write_link(w, link)?; - } - - for node in &graph.nodes { - write_u32(w, node.0)?; - } - - for edge in &graph.edges { - write_u32(w, edge.0)?; - } - - Ok(()) +#[derive(Default)] +pub struct Data { + pub pages: Vec, + pub links: Vec, + pub graph: Graph, } -fn read(r: &mut impl Read) -> io::Result<(Vec, Vec, Graph)> { - let n_pages = read_u32(r)?; - let n_links = read_u32(r)?; - - let mut pages = Vec::with_capacity(n_pages as usize); - let mut links = 
Vec::with_capacity(n_links as usize); - let mut graph = Graph::with_capacity(n_pages as usize, n_links as usize); - - for _ in 0..n_pages { - pages.push(read_page(r)?); +impl Data { + pub fn new() -> Self { + Self::default() } - for _ in 0..n_links { - links.push(read_link(r)?); + pub fn with_capacity(pages: usize, links: usize) -> Self { + Self { + pages: Vec::with_capacity(pages), + links: Vec::with_capacity(links), + graph: Graph::with_capacity(pages, links), + } } - for _ in 0..n_pages { - graph.nodes.push(EdgeIdx(read_u32(r)?)); + fn write(&self, w: &mut impl Write) -> io::Result<()> { + assert!(self.pages.len() < u32::MAX as usize); + assert!(self.links.len() < u32::MAX as usize); + assert_eq!(self.pages.len(), self.graph.nodes.len()); + assert_eq!(self.links.len(), self.graph.edges.len()); + write_u32(w, self.pages.len() as u32)?; + write_u32(w, self.links.len() as u32)?; + + for page in &self.pages { + write_page(w, page)?; + } + + for link in &self.links { + write_link(w, link)?; + } + + for node in &self.graph.nodes { + write_u32(w, node.0)?; + } + + for edge in &self.graph.edges { + write_u32(w, edge.0)?; + } + + Ok(()) } - for _ in 0..n_links { - graph.edges.push(NodeIdx(read_u32(r)?)); + fn read(r: &mut impl Read) -> io::Result { + let n_pages = read_u32(r)?; + let n_links = read_u32(r)?; + + let mut result = Self::with_capacity(n_pages as usize, n_links as usize); + + for _ in 0..n_pages { + result.pages.push(read_page(r)?); + } + + for _ in 0..n_links { + result.links.push(read_link(r)?); + } + + for _ in 0..n_pages { + result.graph.nodes.push(EdgeIdx(read_u32(r)?)); + } + + for _ in 0..n_links { + result.graph.edges.push(NodeIdx(read_u32(r)?)); + } + + assert_eq!(result.pages.len(), result.graph.nodes.len()); + assert_eq!(result.links.len(), result.graph.edges.len()); + result.graph.check_consistency(); + Ok(result) } - assert_eq!(pages.len(), graph.nodes.len()); - assert_eq!(links.len(), graph.edges.len()); - graph.check_consistency(); - Ok((pages, 
links, graph)) -} - -pub fn write_to_file(path: &Path, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> { - let mut file = BufWriter::new(File::create(path)?); - write(&mut file, pages, links, graph) -} - -pub fn read_from_file(path: &Path) -> io::Result<(Vec, Vec, Graph)> { - let mut file = BufReader::new(File::open(path)?); - read(&mut file) + pub fn write_to_file(&self, path: &Path) -> io::Result<()> { + let mut file = BufWriter::new(File::create(path)?); + self.write(&mut file) + } + + pub fn read_from_file(path: &Path) -> io::Result { + let mut file = BufReader::new(File::open(path)?); + Self::read(&mut file) + } } diff --git a/brood/src/graph.rs b/brood/src/graph.rs index 1cc25da..ed6f559 100644 --- a/brood/src/graph.rs +++ b/brood/src/graph.rs @@ -1,6 +1,6 @@ use std::ops::{Add, AddAssign, Range, Sub, SubAssign}; -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct NodeIdx(pub u32); impl NodeIdx { @@ -85,7 +85,7 @@ impl SubAssign for NodeIdx { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct EdgeIdx(pub u32); impl EdgeIdx { @@ -242,6 +242,11 @@ impl Graph { Edges::new(self) } + pub fn edges_for(&self, node: NodeIdx) -> impl Iterator + '_ { + self.edge_range(node) + .map(|i| (EdgeIdx::new(i), self.edges[i])) + } + pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx { self.nodes .get(node.usize()) @@ -255,7 +260,7 @@ impl Graph { start.usize()..end.usize() } - pub fn edges_for(&self, node: NodeIdx) -> &[NodeIdx] { + pub fn edge_slice(&self, node: NodeIdx) -> &[NodeIdx] { &self.edges[self.edge_range(node)] } } @@ -283,15 +288,15 @@ impl Iterator for Edges<'_> { if self.ei.usize() >= self.graph.edges.len() { return None; } - let to = self.graph.edges[self.ei.usize()]; + let target = self.graph.edges[self.ei.usize()]; // if would not be sufficient because some 
nodes may not have any edges. while self.ei >= self.graph.edge_start(self.ni + 1) { self.ni += 1; } - let from = self.ni; + let source = self.ni; self.ei += 1; - Some((from, to)) + Some((source, target)) } } diff --git a/brood/src/util.rs b/brood/src/util.rs index f594058..d908a42 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -4,7 +4,7 @@ use regex::Regex; use thousands::Separable; use crate::{ - data::Page, + data::{Data, Page}, graph::{Graph, NodeIdx}, }; @@ -150,9 +150,9 @@ impl TitleNormalizer { } } -pub fn locate_title(normalizer: &TitleNormalizer, pages: &[Page], title: &str) -> NodeIdx { +pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { let normalized = normalizer.normalize(title); - pages + data.pages .iter() .enumerate() .find(|(_, p)| normalizer.normalize(&p.title) == normalized) @@ -160,10 +160,10 @@ pub fn locate_title(normalizer: &TitleNormalizer, pages: &[Page], title: &str) - .expect("invalid title") } -pub fn resolve_redirects(pages: &[Page], graph: &Graph, mut page: NodeIdx) -> NodeIdx { +pub fn resolve_redirects(data: &Data, mut page: NodeIdx) -> NodeIdx { loop { - if pages[page.usize()].redirect { - if let Some(target) = graph.edges_for(page).first() { + if data.pages[page.usize()].redirect { + if let Some(target) = data.graph.edge_slice(page).first() { page = *target; continue; } @@ -173,11 +173,6 @@ pub fn resolve_redirects(pages: &[Page], graph: &Graph, mut page: NodeIdx) -> No } } -pub fn resolve_title( - normalizer: &TitleNormalizer, - pages: &[Page], - graph: &Graph, - title: &str, -) -> NodeIdx { - resolve_redirects(pages, graph, locate_title(normalizer, pages, title)) +pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { + resolve_redirects(data, locate_title(normalizer, data, title)) } From ceb987bbbc8a6c47d6126843fc4c218e70d80540 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 14:48:59 +0100 Subject: [PATCH 19/36] Add show command --- 
brood/src/commands.rs | 1 + brood/src/commands/show.rs | 79 ++++++++++++++++++++++++++++++++++++++ brood/src/data.rs | 8 ++++ brood/src/main.rs | 2 + brood/src/util.rs | 12 ++---- 5 files changed, 93 insertions(+), 9 deletions(-) create mode 100644 brood/src/commands/show.rs diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 2e77470..d72d397 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,2 +1,3 @@ pub mod ingest; pub mod path; +pub mod show; diff --git a/brood/src/commands/show.rs b/brood/src/commands/show.rs new file mode 100644 index 0000000..2e14aed --- /dev/null +++ b/brood/src/commands/show.rs @@ -0,0 +1,79 @@ +use std::{io, path::Path}; + +use thousands::Separable; + +use crate::{ + data::Data, + util::{self, TitleNormalizer}, +}; + +/// Show info about a specific article. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + title: String, +} + +impl Cmd { + pub fn run(self, data: &Path) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + println!(">> Import"); + let data = Data::read_from_file(data)?; + + println!(">> Locate article"); + let mut node = util::locate_title(&normalizer, &data, &self.title); + + loop { + let page = &data.pages[node.usize()]; + + const W_LABEL: usize = 12; + const W_NUM: usize = 11; + + println!(); + + println!("{:>W_LABEL$}: {}", "Title", page.title); + + println!( + "{:>W_LABEL$}: {}", + "Title (norm)", + normalizer.normalize(&page.title) + ); + + println!("{:>W_LABEL$}: {}", "Redirect", page.redirect); + + println!("{:>W_LABEL$}: {:>W_NUM$}", "ID", page.id); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Length", + page.length.separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Links (out)", + data.graph + .edge_range(node) + .len() + .separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Links (in)", + data.graph + .edges() + .filter(|(_, target)| *target == node) + .count() + .separate_with_underscores() + ); + + node = 
match data.redirect_target(node) { + Some(target) => target, + None => break, + }; + } + + Ok(()) + } +} diff --git a/brood/src/data.rs b/brood/src/data.rs index 20c95a6..2c3213c 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -193,4 +193,12 @@ impl Data { let mut file = BufReader::new(File::open(path)?); Self::read(&mut file) } + + pub fn redirect_target(&self, node: NodeIdx) -> Option { + if !self.pages[node.usize()].redirect { + return None; + } + + self.graph.edge_slice(node).first().copied() + } } diff --git a/brood/src/main.rs b/brood/src/main.rs index c31b1f4..db547ce 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -11,6 +11,7 @@ use clap::Parser; #[derive(Debug, Parser)] enum Command { Ingest(commands::ingest::Cmd), + Show(commands::show::Cmd), Path(commands::path::Cmd), } @@ -25,6 +26,7 @@ fn main() -> io::Result<()> { let args = Args::parse(); match args.command { Command::Ingest(cmd) => cmd.run(&args.datafile), + Command::Show(cmd) => cmd.run(&args.datafile), Command::Path(cmd) => cmd.run(&args.datafile), } } diff --git a/brood/src/util.rs b/brood/src/util.rs index d908a42..0f76c67 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -161,16 +161,10 @@ pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> N } pub fn resolve_redirects(data: &Data, mut page: NodeIdx) -> NodeIdx { - loop { - if data.pages[page.usize()].redirect { - if let Some(target) = data.graph.edge_slice(page).first() { - page = *target; - continue; - } - } - - return page; + while let Some(target) = data.redirect_target(page) { + page = target; } + page } pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { From 693ae9eb813b3aa22e10b7e60f5cbba0553a349b Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 15:13:11 +0100 Subject: [PATCH 20/36] Show more info and optionally links --- brood/src/commands/list_links.rs | 87 ----------------------------- brood/src/commands/path.rs | 7 +-- 
brood/src/commands/show.rs | 95 ++++++++++++++++++++++++++++---- brood/src/util.rs | 8 +++ 4 files changed, 94 insertions(+), 103 deletions(-) delete mode 100644 brood/src/commands/list_links.rs diff --git a/brood/src/commands/list_links.rs b/brood/src/commands/list_links.rs deleted file mode 100644 index 37c9972..0000000 --- a/brood/src/commands/list_links.rs +++ /dev/null @@ -1,87 +0,0 @@ -use std::{ - collections::HashSet, - fs::File, - io::{self, BufReader}, - path::Path, -}; - -use crate::{ - data::{ - adjacency_list::AdjacencyList, - info::{LinkInfo, PageInfo}, - store, - }, - util, -}; - -fn links_from(data: &AdjacencyList, idx: u32) -> HashSet { - data.links(idx).map(|(_, ld)| ld.to).collect() -} - -fn links_to(data: &AdjacencyList, idx: u32) -> HashSet { - let mut links = HashSet::::new(); - for (pi, _) in data.pages() { - for (_, ld) in data.links(pi) { - if ld.to == idx { - links.insert(pi); - continue; - } - } - } - links -} - -fn print_links(data: &AdjacencyList, name: &str, links: &HashSet) { - let mut links = links - .iter() - .map(|pi| { - let page = data.page(*pi); - (&page.data.title as &str, page.data.redirect) - }) - .collect::>(); - - links.sort(); - - println!(">> {name} ({}):", links.len()); - for (title, redirect) in links { - if redirect { - println!("v {title}"); - } else { - println!("- {title}"); - } - } -} - -pub fn run(datafile: &Path, page: &str) -> io::Result<()> { - println!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - - println!(">> Locate page"); - let idx = util::resolve_redirects(&data, util::find_index_of_title(&data.pages, page)); - println!("Page: {:?}", data.page(idx).data.title); - - println!(">> Find links"); - let from = links_from(&data, idx); - let to = links_to(&data, idx); - let twins = from.intersection(&to).copied().collect::>(); - let twinless_from = from.difference(&twins).copied().collect::>(); - let twinless_to = 
to.difference(&twins).copied().collect::>(); - - println!(); - print_links(&data, "From", &from); - - println!(); - print_links(&data, "To", &to); - - println!(); - print_links(&data, "Twins", &twins); - - println!(); - print_links(&data, "From without twins", &twinless_from); - - println!(); - print_links(&data, "To without twins", &twinless_to); - - Ok(()) -} diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index 882203c..ebea2a6 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -48,12 +48,7 @@ impl Cmd { println!(); println!("Path found (cost {cost}, length {}):", path.len()); for page in path { - let info = &data.pages[page.usize()]; - if info.redirect { - println!("v {:?}", info.title); - } else { - println!("- {:?}", info.title); - } + println!("{}", util::fmt_page(&data.pages[page.usize()])); } Ok(()) diff --git a/brood/src/commands/show.rs b/brood/src/commands/show.rs index 2e14aed..894d11d 100644 --- a/brood/src/commands/show.rs +++ b/brood/src/commands/show.rs @@ -1,4 +1,4 @@ -use std::{io, path::Path}; +use std::{collections::HashSet, io, path::Path}; use thousands::Separable; @@ -11,6 +11,10 @@ use crate::{ #[derive(Debug, clap::Parser)] pub struct Cmd { title: String, + + /// Print links in more detail. 
+ #[arg(long, short)] + links: bool, } impl Cmd { @@ -49,25 +53,96 @@ impl Cmd { page.length.separate_with_underscores() ); + let outlinks = data.graph.edge_slice(node).to_vec(); + let inlinks = data + .graph + .edges() + .filter(|(_, target)| *target == node) + .map(|(source, _)| source) + .collect::>(); + + let outlinks_set = outlinks.iter().copied().collect::>(); + let inlinks_set = inlinks.iter().copied().collect::>(); + let twins_set = outlinks_set + .intersection(&inlinks_set) + .copied() + .collect::>(); + println!( "{:>W_LABEL$}: {:>W_NUM$}", "Links (out)", - data.graph - .edge_range(node) - .len() - .separate_with_underscores() + outlinks.len().separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "unique", + outlinks_set.len().separate_with_underscores() ); println!( "{:>W_LABEL$}: {:>W_NUM$}", "Links (in)", - data.graph - .edges() - .filter(|(_, target)| *target == node) - .count() - .separate_with_underscores() + inlinks.len().separate_with_underscores() ); + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "unique", + inlinks_set.len().separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Twins", + twins_set.len().separate_with_underscores() + ); + + if self.links { + let mut twin_pages = twins_set + .iter() + .map(|n| &data.pages[n.usize()]) + .collect::>(); + + let mut outlink_only_pages = outlinks_set + .difference(&twins_set) + .map(|n| &data.pages[n.usize()]) + .collect::>(); + + let mut inlink_only_pages = inlinks_set + .difference(&twins_set) + .map(|n| &data.pages[n.usize()]) + .collect::>(); + + twin_pages.sort_by_key(|p| &p.title); + outlink_only_pages.sort_by_key(|p| &p.title); + inlink_only_pages.sort_by_key(|p| &p.title); + + println!(); + println!("Twins ({}):", twin_pages.len().separate_with_underscores()); + for page in twin_pages { + println!("{}", util::fmt_page(page)); + } + + println!(); + println!( + "Only outlinks ({}):", + outlink_only_pages.len().separate_with_underscores() + ); + for 
page in outlink_only_pages { + println!("{}", util::fmt_page(page)); + } + + println!(); + println!( + "Only inlinks ({}):", + inlink_only_pages.len().separate_with_underscores() + ); + for page in inlink_only_pages { + println!("{}", util::fmt_page(page)); + } + } + node = match data.redirect_target(node) { Some(target) => target, None => break, diff --git a/brood/src/util.rs b/brood/src/util.rs index 0f76c67..2a8f1d0 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -170,3 +170,11 @@ pub fn resolve_redirects(data: &Data, mut page: NodeIdx) -> NodeIdx { pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { resolve_redirects(data, locate_title(normalizer, data, title)) } + +pub fn fmt_page(page: &Page) -> String { + if page.redirect { + format!("v {}", page.title) + } else { + format!("- {}", page.title) + } +} From ab7b7295ca9e037864b903631c8d56fbc6261df6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 15:13:17 +0100 Subject: [PATCH 21/36] Remove unused code --- brood/src/data.rs | 2 -- brood/src/graph.rs | 6 ------ brood/src/util.rs | 4 ++-- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/brood/src/data.rs b/brood/src/data.rs index 2c3213c..091354b 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -31,8 +31,6 @@ impl Link { } } -struct Store<'a, W>(&'a mut W); - fn write_u8(w: &mut impl Write, n: u8) -> io::Result<()> { w.write_all(&n.to_le_bytes()) } diff --git a/brood/src/graph.rs b/brood/src/graph.rs index ed6f559..95c53e1 100644 --- a/brood/src/graph.rs +++ b/brood/src/graph.rs @@ -89,8 +89,6 @@ impl SubAssign for NodeIdx { pub struct EdgeIdx(pub u32); impl EdgeIdx { - pub const NONE: Self = Self(u32::MAX); - #[inline] pub const fn new(value: usize) -> Self { Self(value as u32) @@ -185,10 +183,6 @@ pub struct Graph { } impl Graph { - pub fn new() -> Self { - Self::default() - } - pub fn with_capacity(nodes: usize, edges: usize) -> Self { Self { nodes: Vec::with_capacity(nodes), 
diff --git a/brood/src/util.rs b/brood/src/util.rs index 2a8f1d0..cb5ef33 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,11 +1,11 @@ -use std::{fmt, iter, thread::panicking, time::Instant}; +use std::{fmt, time::Instant}; use regex::Regex; use thousands::Separable; use crate::{ data::{Data, Page}, - graph::{Graph, NodeIdx}, + graph::NodeIdx, }; pub struct Counter { From c573f1b0b020ef824d1be45df8e1f3881da784c5 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 15:30:11 +0100 Subject: [PATCH 22/36] Allow transforming graph before commands --- brood/src/algo.rs | 3 +- brood/src/algo/edit.rs | 74 ++++++++++++++++++++++++++++++++++++ brood/src/commands/ingest.rs | 6 +-- brood/src/commands/path.rs | 7 +--- brood/src/commands/show.rs | 7 +--- brood/src/data.rs | 16 ++++++++ brood/src/graph.rs | 4 ++ brood/src/main.rs | 34 +++++++++++++++-- 8 files changed, 134 insertions(+), 17 deletions(-) create mode 100644 brood/src/algo/edit.rs diff --git a/brood/src/algo.rs b/brood/src/algo.rs index ffc1aa5..ac1919f 100644 --- a/brood/src/algo.rs +++ b/brood/src/algo.rs @@ -1,3 +1,4 @@ mod dijkstra; +mod edit; -pub use self::dijkstra::*; +pub use self::{dijkstra::*, edit::*}; diff --git a/brood/src/algo/edit.rs b/brood/src/algo/edit.rs new file mode 100644 index 0000000..2b44298 --- /dev/null +++ b/brood/src/algo/edit.rs @@ -0,0 +1,74 @@ +use std::mem; + +use crate::{ + data::{Data, Link}, + graph::NodeIdx, + util, +}; + +pub fn retain_edges(data: &mut Data, f: impl Fn(&Link) -> bool) { + let mut links = mem::take(&mut data.links).into_iter(); + let graph = mem::take(&mut data.graph); + + for node in graph.nodes() { + data.graph.add_node(); + + for edge in graph.edge_slice(node) { + let link = links.next().unwrap(); + if f(&link) { + data.links.push(link); + data.graph.add_edge(*edge); + } + } + } +} + +pub fn resolve_redirects(data: &mut Data) { + // Permutation from input node to input node + let mut perm_redirect = vec![NodeIdx::NONE; data.pages.len()]; + 
for node in data.graph.nodes() { + perm_redirect[node.usize()] = util::resolve_redirects(data, node); + } + + // Permutation from input node to final node + let mut perm_retain = vec![NodeIdx::NONE; data.pages.len()]; + let mut perm_retain_count = NodeIdx(0); + for (i, page) in data.pages.iter().enumerate() { + if !page.redirect { + perm_retain[i] = perm_retain_count; + perm_retain_count += 1; + } + } + + let mut pages = mem::take(&mut data.pages).into_iter(); + let mut links = mem::take(&mut data.links).into_iter(); + let graph = mem::take(&mut data.graph); + + for node in graph.nodes() { + let page = pages.next().unwrap(); + let new_node = perm_retain[node.usize()]; + + if new_node == NodeIdx::NONE { + // Skip all edges + for _ in graph.edge_slice(node) { + links.next().unwrap(); + } + continue; + } + + data.pages.push(page); + data.graph.add_node(); + + for edge in graph.edge_slice(node) { + let link = links.next().unwrap(); + let new_edge = perm_retain[perm_redirect[edge.usize()].usize()]; + + if new_edge == NodeIdx::NONE { + continue; + } + + data.links.push(link); + data.graph.add_edge(new_edge); + } + } +} diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 42fc13a..2036062 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -121,7 +121,7 @@ fn read_page_data( for (target, start, len, flags) in page_links { if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) { - data.graph.edges.push(NodeIdx(*brood_i)); + data.graph.add_edge(NodeIdx(*brood_i)); data.links.push(Link { start, len, flags }); } } @@ -139,7 +139,7 @@ pub struct Cmd { } impl Cmd { - pub fn run(self, brood_data: &Path) -> io::Result<()> { + pub fn run(&self, brood_data: &Path) -> io::Result<()> { let normalizer = TitleNormalizer::new(); println!(">> First pass"); @@ -162,7 +162,7 @@ impl Cmd { drop(sift_data); // No longer needed println!("> Checking consistency"); - data.graph.check_consistency(); + 
data.check_consistency(); println!(">> Export"); println!( diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index ebea2a6..d21ba95 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -1,4 +1,4 @@ -use std::{io, path::Path}; +use std::io; use crate::{ algo::Dijkstra, @@ -14,12 +14,9 @@ pub struct Cmd { } impl Cmd { - pub fn run(self, data: &Path) -> io::Result<()> { + pub fn run(self, data: Data) -> io::Result<()> { let normalizer = TitleNormalizer::new(); - println!(">> Import"); - let data = Data::read_from_file(data)?; - println!(">> Resolve articles"); let start = util::resolve_title(&normalizer, &data, &self.start); let goal = util::resolve_title(&normalizer, &data, &self.goal); diff --git a/brood/src/commands/show.rs b/brood/src/commands/show.rs index 894d11d..0c67388 100644 --- a/brood/src/commands/show.rs +++ b/brood/src/commands/show.rs @@ -1,4 +1,4 @@ -use std::{collections::HashSet, io, path::Path}; +use std::{collections::HashSet, io}; use thousands::Separable; @@ -18,12 +18,9 @@ pub struct Cmd { } impl Cmd { - pub fn run(self, data: &Path) -> io::Result<()> { + pub fn run(self, data: Data) -> io::Result<()> { let normalizer = TitleNormalizer::new(); - println!(">> Import"); - let data = Data::read_from_file(data)?; - println!(">> Locate article"); let mut node = util::locate_title(&normalizer, &data, &self.title); diff --git a/brood/src/data.rs b/brood/src/data.rs index 091354b..c253094 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -192,6 +192,22 @@ impl Data { Self::read(&mut file) } + pub fn check_consistency(&self) { + assert_eq!( + self.pages.len(), + self.graph.nodes.len(), + "inconsistent number of pages" + ); + + assert_eq!( + self.links.len(), + self.graph.edges.len(), + "inconsistent number of links" + ); + + self.graph.check_consistency(); + } + pub fn redirect_target(&self, node: NodeIdx) -> Option { if !self.pages[node.usize()].redirect { return None; diff --git a/brood/src/graph.rs 
b/brood/src/graph.rs index 95c53e1..620b81a 100644 --- a/brood/src/graph.rs +++ b/brood/src/graph.rs @@ -194,6 +194,10 @@ impl Graph { self.nodes.push(EdgeIdx::new(self.edges.len())); } + pub fn add_edge(&mut self, target: NodeIdx) { + self.edges.push(target); + } + pub fn check_consistency(&self) { if self.nodes.is_empty() { assert!(self.edges.is_empty(), "edges must belong to existing nodes"); diff --git a/brood/src/main.rs b/brood/src/main.rs index db547ce..9f1af1e 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -7,6 +7,7 @@ mod util; use std::{io, path::PathBuf}; use clap::Parser; +use data::Data; #[derive(Debug, Parser)] enum Command { @@ -20,13 +21,40 @@ struct Args { datafile: PathBuf, #[command(subcommand)] command: Command, + #[arg(long, short = 'P')] + in_parens: Option, + #[arg(long, short = 'S')] + in_structure: Option, + #[arg(long, short = 'R')] + resolve_redirects: bool, } fn main() -> io::Result<()> { let args = Args::parse(); + + if let Command::Ingest(cmd) = &args.command { + return cmd.run(&args.datafile); + } + + println!(">> Import"); + let mut data = Data::read_from_file(&args.datafile)?; + + if args.in_parens.is_some() || args.in_structure.is_some() { + println!("> Filtering edges"); + algo::retain_edges(&mut data, |link| { + args.in_parens.is_none_or(|b| b == link.in_parens()) + && args.in_structure.is_none_or(|b| b == link.in_structure()) + }); + } + + if args.resolve_redirects { + println!("> Resolving redirects"); + algo::resolve_redirects(&mut data); + } + match args.command { - Command::Ingest(cmd) => cmd.run(&args.datafile), - Command::Show(cmd) => cmd.run(&args.datafile), - Command::Path(cmd) => cmd.run(&args.datafile), + Command::Ingest(_) => unreachable!(), + Command::Show(cmd) => cmd.run(data), + Command::Path(cmd) => cmd.run(data), } } From 535d7ff236e0feaab026fd28c3e111dfadbe7b23 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 15:38:03 +0100 Subject: [PATCH 23/36] Add export command --- 
brood/src/commands.rs | 1 + brood/src/commands/export.rs | 17 ++++++++++++ brood/src/commands/reexport.rs | 48 ---------------------------------- brood/src/main.rs | 2 ++ 4 files changed, 20 insertions(+), 48 deletions(-) create mode 100644 brood/src/commands/export.rs delete mode 100644 brood/src/commands/reexport.rs diff --git a/brood/src/commands.rs b/brood/src/commands.rs index d72d397..cc694c1 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,3 +1,4 @@ +pub mod export; pub mod ingest; pub mod path; pub mod show; diff --git a/brood/src/commands/export.rs b/brood/src/commands/export.rs new file mode 100644 index 0000000..aad5dd8 --- /dev/null +++ b/brood/src/commands/export.rs @@ -0,0 +1,17 @@ +use std::{io, path::PathBuf}; + +use crate::data::Data; + +#[derive(Debug, clap::Parser)] +pub struct Cmd { + out: PathBuf, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + println!(">> Export"); + data.write_to_file(&self.out)?; + + Ok(()) + } +} diff --git a/brood/src/commands/reexport.rs b/brood/src/commands/reexport.rs deleted file mode 100644 index 1125fb0..0000000 --- a/brood/src/commands/reexport.rs +++ /dev/null @@ -1,48 +0,0 @@ -use std::fs::File; -use std::io::{self, BufReader, BufWriter}; -use std::path::Path; - -use crate::data::adjacency_list::AdjacencyList; -use crate::data::store; - -pub fn reexport( - from: &Path, - to: &Path, - in_parens: Option, - in_structure: Option, -) -> io::Result<()> { - eprintln!(">> Import"); - let mut from = BufReader::new(File::open(from)?); - let mut data = store::read_adjacency_list(&mut from)?; - - eprintln!(">> Consistency check"); - data.check_consistency(); - - if in_parens.is_some() || in_structure.is_some() { - eprintln!(">> Filtering"); - - let mut data2 = AdjacencyList::default(); - for (page_idx, page) in data.pages() { - data2.push_page(page.data.clone()); - for (_, link) in data.links(page_idx) { - if in_parens.is_some_and(|v| v != link.data.in_parens()) { - continue; - } - - 
if in_structure.is_some_and(|v| v != link.data.in_structure()) { - continue; - } - - data2.push_link(link.to, link.data); - } - } - - data = data2; - } - - eprintln!(">> Export"); - let mut to = BufWriter::new(File::create(to)?); - store::write_adjacency_list(&data, &mut to)?; - - Ok(()) -} diff --git a/brood/src/main.rs b/brood/src/main.rs index 9f1af1e..f0e1a30 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -12,6 +12,7 @@ use data::Data; #[derive(Debug, Parser)] enum Command { Ingest(commands::ingest::Cmd), + Export(commands::export::Cmd), Show(commands::show::Cmd), Path(commands::path::Cmd), } @@ -54,6 +55,7 @@ fn main() -> io::Result<()> { match args.command { Command::Ingest(_) => unreachable!(), + Command::Export(cmd) => cmd.run(data), Command::Show(cmd) => cmd.run(data), Command::Path(cmd) => cmd.run(data), } From 6611dd31600972698424e7ddbb390d61fcf07c5b Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 15:51:58 +0100 Subject: [PATCH 24/36] Add --bidi flag to path command --- brood/src/commands/path.rs | 76 +++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index d21ba95..4f58bb6 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -3,6 +3,7 @@ use std::io; use crate::{ algo::Dijkstra, data::Data, + graph::NodeIdx, util::{self, TitleNormalizer}, }; @@ -11,6 +12,46 @@ use crate::{ pub struct Cmd { start: String, goal: String, + + // Search for a path in both directions. 
+ #[arg(long, short)] + bidi: bool, +} + +fn search_path(data: &Data, start: NodeIdx, goal: NodeIdx) -> Option<(u32, Vec)> { + println!("> Preparing dijkstra"); + let mut dijkstra = Dijkstra::new(&data.graph); + println!("> Running dijkstra"); + dijkstra.run( + start, + |node| node == goal, + |source, _edge, _target| !data.pages[source.usize()].redirect as u32, + ); + + if dijkstra.cost(goal) == u32::MAX { + return None; + } + + println!("> Collecting path"); + let cost = dijkstra.cost(goal); + let path = dijkstra.path(goal); + Some((cost, path)) +} + +fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec)>) { + let start = &data.pages[start.usize()].title; + let goal = &data.pages[goal.usize()].title; + + let Some((cost, path)) = path else { + println!("No path found from {start} to {goal}"); + return; + }; + + println!("Path found (cost {cost}, length {}):", path.len()); + + for page in path { + println!("{}", util::fmt_page(&data.pages[page.usize()])); + } } impl Cmd { @@ -23,29 +64,22 @@ impl Cmd { println!("Start: {}", data.pages[start.usize()].title); println!("Goal: {}", data.pages[goal.usize()].title); - println!(">> Find path"); - println!("> Preparing dijkstra"); - let mut dijkstra = Dijkstra::new(&data.graph); - println!("> Running dijkstra"); - dijkstra.run( - start, - |node| node == goal, - |source, _edge, _target| !data.pages[source.usize()].redirect as u32, - ); + if self.bidi { + println!(">> Find path forward"); + let forward = search_path(&data, start, goal); + println!(">> Find path backward"); + let backward = search_path(&data, goal, start); - if dijkstra.cost(goal) == u32::MAX { - println!("No path found"); - return Ok(()); - } + println!(); + print_path(&data, start, goal, forward); + println!(); + print_path(&data, goal, start, backward); + } else { + println!(">> Find path"); + let path = search_path(&data, start, goal); - println!("> Collecting path"); - let path = dijkstra.path(goal); - let cost = 
dijkstra.cost(goal); - - println!(); - println!("Path found (cost {cost}, length {}):", path.len()); - for page in path { - println!("{}", util::fmt_page(&data.pages[page.usize()])); + println!(); + print_path(&data, start, goal, path); } Ok(()) From 1f20e0519a1c1a1cd9413472f3926b11119ec4a9 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 16:40:42 +0100 Subject: [PATCH 25/36] Fix off-by-one in consistency check --- brood/src/graph.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brood/src/graph.rs b/brood/src/graph.rs index 620b81a..e7849fb 100644 --- a/brood/src/graph.rs +++ b/brood/src/graph.rs @@ -215,8 +215,8 @@ impl Graph { for (ni, node) in self.nodes.iter().cloned().enumerate() { assert!( - node.usize() < self.edges.len(), - "node pointers must in range" + node.usize() <= self.edges.len(), + "node pointers must be in range" ); if let Some(succ) = self.nodes.get(ni + 1) { From 04482f9f2f4fd49bdb77e0fb4fd3ed68bbc085ac Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 16:40:55 +0100 Subject: [PATCH 26/36] Detect redirect cycles --- brood/src/util.rs | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/brood/src/util.rs b/brood/src/util.rs index cb5ef33..bf42980 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,4 +1,4 @@ -use std::{fmt, time::Instant}; +use std::{collections::HashSet, fmt, time::Instant}; use regex::Regex; use thousands::Separable; @@ -160,11 +160,25 @@ pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> N .expect("invalid title") } -pub fn resolve_redirects(data: &Data, mut page: NodeIdx) -> NodeIdx { - while let Some(target) = data.redirect_target(page) { - page = target; +pub fn resolve_redirects(data: &Data, node: NodeIdx) -> NodeIdx { + let mut curr = node; + let mut seen = HashSet::new(); + + seen.insert(curr); + while let Some(target) = data.redirect_target(curr) { + if seen.contains(&target) { + println!( + " 
Redirect cycle deteted: {:?}", + data.pages[node.usize()].title + ); + break; + } + + seen.insert(target); + curr = target; } - page + + curr } pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { From e90864a0976c6fad8eb12b4af0b0f913abfeda03 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 16:41:19 +0100 Subject: [PATCH 27/36] Add -I and -c cli options --- brood/src/algo/edit.rs | 23 +++++++++++++++++++++++ brood/src/main.rs | 15 +++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/brood/src/algo/edit.rs b/brood/src/algo/edit.rs index 2b44298..2be0c0a 100644 --- a/brood/src/algo/edit.rs +++ b/brood/src/algo/edit.rs @@ -72,3 +72,26 @@ pub fn resolve_redirects(data: &mut Data) { } } } + +pub fn invert(data: &mut Data) { + let links = mem::take(&mut data.links); + let graph = mem::take(&mut data.graph); + + let mut edges = graph + .edges() + .zip(links) + .map(|((source, target), link)| (source, target, link)) + .collect::>(); + + edges.sort_by_key(|(_, target, _)| *target); + + let mut edges = edges.into_iter().peekable(); + for node in graph.nodes() { + data.graph.add_node(); + while edges.peek().is_some_and(|(_, target, _)| *target <= node) { + let (source, _, link) = edges.next().unwrap(); + data.graph.add_edge(source); + data.links.push(link); + } + } +} diff --git a/brood/src/main.rs b/brood/src/main.rs index f0e1a30..757695b 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -28,6 +28,10 @@ struct Args { in_structure: Option, #[arg(long, short = 'R')] resolve_redirects: bool, + #[arg(long, short = 'I')] + invert_edges: bool, + #[arg(long, short)] + check_consistency: bool, } fn main() -> io::Result<()> { @@ -38,6 +42,7 @@ fn main() -> io::Result<()> { } println!(">> Import"); + println!("> Reading data"); let mut data = Data::read_from_file(&args.datafile)?; if args.in_parens.is_some() || args.in_structure.is_some() { @@ -53,6 +58,16 @@ fn main() -> io::Result<()> { algo::resolve_redirects(&mut 
data); } + if args.invert_edges { + println!("> Inverting edges"); + algo::invert(&mut data); + } + + if args.check_consistency { + println!("> Checking consistencey"); + data.check_consistency(); + } + match args.command { Command::Ingest(_) => unreachable!(), Command::Export(cmd) => cmd.run(data), From 76efd6d728bcd55ce72c809de43ac99c235fed1f Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 17:14:21 +0100 Subject: [PATCH 28/36] Add redirects command --- brood/src/commands.rs | 1 + brood/src/commands/redirects.rs | 107 ++++++++++++++++++++++++++++++++ brood/src/main.rs | 2 + 3 files changed, 110 insertions(+) create mode 100644 brood/src/commands/redirects.rs diff --git a/brood/src/commands.rs b/brood/src/commands.rs index cc694c1..f58324f 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,4 +1,5 @@ pub mod export; pub mod ingest; pub mod path; +pub mod redirects; pub mod show; diff --git a/brood/src/commands/redirects.rs b/brood/src/commands/redirects.rs new file mode 100644 index 0000000..aeab362 --- /dev/null +++ b/brood/src/commands/redirects.rs @@ -0,0 +1,107 @@ +use std::{cmp::Reverse, collections::HashSet, io}; + +use thousands::Separable; + +use crate::{data::Data, graph::NodeIdx, util}; + +fn find_redirects(data: &Data) -> Vec<(NodeIdx, NodeIdx, usize)> { + let mut redirects = Vec::<(NodeIdx, NodeIdx, usize)>::new(); + + for node in data.graph.nodes() { + if !data.pages[node.usize()].redirect { + continue; + } + + let mut seen = HashSet::new(); + + let mut curr = node; + seen.insert(node); + + while let Some(next) = data.redirect_target(curr) { + if seen.contains(&next) { + println!(" Redirect loop: {}", data.pages[node.usize()].title); + break; + } + + curr = next; + seen.insert(next); + } + + redirects.push((node, curr, seen.len() - 1)); + } + + redirects +} + +fn follow_redirect(data: &Data, start: NodeIdx) -> Vec { + let mut seen = HashSet::new(); + let mut nodes = Vec::new(); + + let mut curr = start; + 
seen.insert(curr); + nodes.push(curr); + + while let Some(next) = data.redirect_target(curr) { + if seen.contains(&next) { + break; + } + + curr = next; + seen.insert(curr); + nodes.push(curr); + } + + nodes +} + +/// Show interesting redirect stats. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + #[arg(long, short)] + long: bool, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + println!(">> Resolving redirects"); + let redirects = find_redirects(&data); + + println!(); + println!( + "There is a total of {} redirects.", + redirects.len().separate_with_underscores() + ); + + let mut long = redirects + .iter() + .filter(|(_, _, l)| *l > 1) + .collect::>(); + long.sort_by_key(|(_, _, l)| Reverse(l)); + + println!( + "{} redirects take more than one step to reach an article.", + long.len().separate_with_underscores() + ); + + println!( + "The longest redirect chain takes {} steps.", + long.iter().map(|(_, _, l)| l).max().copied().unwrap_or(0), + ); + + println!("Though these redirect chains are usually swiftly fixed by bots."); + + if self.long { + println!(); + println!("Redirect chains with length > 1:"); + + for (start, _, _) in long { + println!(); + for step in follow_redirect(&data, *start) { + println!("{}", util::fmt_page(&data.pages[step.usize()])); + } + } + } + + Ok(()) + } +} diff --git a/brood/src/main.rs b/brood/src/main.rs index 757695b..a84ee1b 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -15,6 +15,7 @@ enum Command { Export(commands::export::Cmd), Show(commands::show::Cmd), Path(commands::path::Cmd), + Redirects(commands::redirects::Cmd), } #[derive(Debug, Parser)] @@ -73,5 +74,6 @@ fn main() -> io::Result<()> { Command::Export(cmd) => cmd.run(data), Command::Show(cmd) => cmd.run(data), Command::Path(cmd) => cmd.run(data), + Command::Redirects(cmd) => cmd.run(data), } } From 5b8feb63682b1911baf89d8a9c9b1e59e8fbe68f Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 17:38:27 +0100 Subject: [PATCH 29/36] 
Add general stats command --- brood/src/commands.rs | 2 +- brood/src/commands/stats.rs | 95 +++++++++++++++++++++ brood/src/commands/{ => stats}/redirects.rs | 6 +- brood/src/main.rs | 4 +- 4 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 brood/src/commands/stats.rs rename brood/src/commands/{ => stats}/redirects.rs (96%) diff --git a/brood/src/commands.rs b/brood/src/commands.rs index f58324f..6c9bd5e 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,5 +1,5 @@ pub mod export; pub mod ingest; pub mod path; -pub mod redirects; pub mod show; +pub mod stats; diff --git a/brood/src/commands/stats.rs b/brood/src/commands/stats.rs new file mode 100644 index 0000000..5ee1272 --- /dev/null +++ b/brood/src/commands/stats.rs @@ -0,0 +1,95 @@ +mod redirects; + +use std::io; + +use thousands::Separable; + +use crate::data::Data; + +#[derive(Debug, clap::Parser)] +enum Command { + Redirects(redirects::Cmd), +} + +/// Show interesting stats. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + #[command(subcommand)] + command: Option, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + if let Some(cmd) = self.command { + return match cmd { + Command::Redirects(cmd) => cmd.run(data), + }; + } + + println!(); + + const W_LABEL: usize = 14; + const W_NUM: usize = 11; + + let n_pages = data.pages.len(); + let n_redirects = data.pages.iter().filter(|p| p.redirect).count(); + let n_articles = n_pages - n_redirects; + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Pages", + n_pages.separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Articles", + n_articles.separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Redirects", + n_redirects.separate_with_underscores() + ); + + println!(); + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Links", + data.links.len().separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "in parens", + data.links + .iter() + 
.filter(|l| l.in_parens()) + .count() + .separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "in structures", + data.links + .iter() + .filter(|l| l.in_structure()) + .count() + .separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "pg eligible", + data.links + .iter() + .filter(|l| !l.in_parens() && !l.in_structure()) + .count() + .separate_with_underscores() + ); + + Ok(()) + } +} diff --git a/brood/src/commands/redirects.rs b/brood/src/commands/stats/redirects.rs similarity index 96% rename from brood/src/commands/redirects.rs rename to brood/src/commands/stats/redirects.rs index aeab362..6bf2204 100644 --- a/brood/src/commands/redirects.rs +++ b/brood/src/commands/stats/redirects.rs @@ -54,19 +54,19 @@ fn follow_redirect(data: &Data, start: NodeIdx) -> Vec { nodes } -/// Show interesting redirect stats. +/// Show redirect stats. #[derive(Debug, clap::Parser)] pub struct Cmd { + /// Show more detailed info. #[arg(long, short)] long: bool, } impl Cmd { pub fn run(self, data: Data) -> io::Result<()> { - println!(">> Resolving redirects"); + println!(">> Resolve redirects"); let redirects = find_redirects(&data); - println!(); println!( "There is a total of {} redirects.", redirects.len().separate_with_underscores() diff --git a/brood/src/main.rs b/brood/src/main.rs index a84ee1b..ba71e52 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -15,7 +15,7 @@ enum Command { Export(commands::export::Cmd), Show(commands::show::Cmd), Path(commands::path::Cmd), - Redirects(commands::redirects::Cmd), + Stats(commands::stats::Cmd), } #[derive(Debug, Parser)] @@ -74,6 +74,6 @@ fn main() -> io::Result<()> { Command::Export(cmd) => cmd.run(data), Command::Show(cmd) => cmd.run(data), Command::Path(cmd) => cmd.run(data), - Command::Redirects(cmd) => cmd.run(data), + Command::Stats(cmd) => cmd.run(data), } } From cdf9a7d7ae2a48d9f435edc522e2ac502e07ac31 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 18:26:12 
+0100 Subject: [PATCH 30/36] Compute article rankings by link degrees --- brood/src/commands/stats.rs | 3 ++ brood/src/commands/stats/degrees.rs | 84 +++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 brood/src/commands/stats/degrees.rs diff --git a/brood/src/commands/stats.rs b/brood/src/commands/stats.rs index 5ee1272..760cec6 100644 --- a/brood/src/commands/stats.rs +++ b/brood/src/commands/stats.rs @@ -1,3 +1,4 @@ +mod degrees; mod redirects; use std::io; @@ -8,6 +9,7 @@ use crate::data::Data; #[derive(Debug, clap::Parser)] enum Command { + Degrees(degrees::Cmd), Redirects(redirects::Cmd), } @@ -22,6 +24,7 @@ impl Cmd { pub fn run(self, data: Data) -> io::Result<()> { if let Some(cmd) = self.command { return match cmd { + Command::Degrees(cmd) => cmd.run(data), Command::Redirects(cmd) => cmd.run(data), }; } diff --git a/brood/src/commands/stats/degrees.rs b/brood/src/commands/stats/degrees.rs new file mode 100644 index 0000000..e73c0cf --- /dev/null +++ b/brood/src/commands/stats/degrees.rs @@ -0,0 +1,84 @@ +use std::{cmp::Reverse, io}; + +use crate::{ + algo, + data::{Data, Page}, + util, +}; + +/// Show stats on article in- and out-degrees. 
+#[derive(Debug, clap::Parser)] +pub struct Cmd { + #[arg(long, short, default_value_t = 5)] + top: usize, +} + +impl Cmd { + pub fn run(self, mut data: Data) -> io::Result<()> { + println!(">> Outdegree"); + println!("> Counting links"); + let mut outdegree = vec![usize::MAX; data.pages.len()]; + for node in data.graph.nodes() { + outdegree[node.usize()] = data.graph.edge_range(node).len(); + } + + println!(">> Indegree"); + println!("> Inverting edges"); + algo::invert(&mut data); + let mut indegree = vec![usize::MAX; data.pages.len()]; + println!("> Counting links"); + for node in data.graph.nodes() { + indegree[node.usize()] = data.graph.edge_range(node).len(); + } + + let mut by_degrees = data + .pages + .iter() + .zip(outdegree) + .zip(indegree) + .map(|((p, od), id)| (p, od, id)) + .collect::>(); + + println!(); + println!("Most outlinks"); + println!("¯¯¯¯¯¯¯¯¯¯¯¯¯"); + + by_degrees.sort_by_key(|(_, od, _)| Reverse(*od)); + self.print_links(&by_degrees); + + println!(); + println!("Most inlinks"); + println!("¯¯¯¯¯¯¯¯¯¯¯¯"); + + by_degrees.sort_by_key(|(_, _, id)| Reverse(*id)); + self.print_links(&by_degrees); + + by_degrees.retain(|(_, od, id)| *od > 0 && *id > 0); + + println!(); + println!("Most outlinks per non-zero inlink"); + println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯"); + + by_degrees.sort_by(|(_, od1, id1), (_, od2, id2)| { + let r1 = *od1 as f32 / *id1 as f32; + let r2 = *od2 as f32 / *id2 as f32; + r2.total_cmp(&r1) // Reverse order so max values are at beginnibg + }); + self.print_links(&by_degrees); + + println!(); + println!("Most inlinks per non-zero outlink"); + println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯"); + + by_degrees.reverse(); + self.print_links(&by_degrees); + + Ok(()) + } + + fn print_links(&self, by_degrees: &Vec<(&Page, usize, usize)>) { + for (page, od, id) in by_degrees.iter().take(self.top) { + println!("{} ({od} out, {id} in)", util::fmt_page(page)); + } + } +} From 698b6590d1e27d4673d9517fd1bcd4f5c46b0416 Mon Sep 17 00:00:00 
2001 From: Joscha Date: Tue, 31 Dec 2024 18:27:18 +0100 Subject: [PATCH 31/36] Remove unused method --- brood/src/graph.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/brood/src/graph.rs b/brood/src/graph.rs index e7849fb..a869300 100644 --- a/brood/src/graph.rs +++ b/brood/src/graph.rs @@ -240,11 +240,6 @@ impl Graph { Edges::new(self) } - pub fn edges_for(&self, node: NodeIdx) -> impl Iterator + '_ { - self.edge_range(node) - .map(|i| (EdgeIdx::new(i), self.edges[i])) - } - pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx { self.nodes .get(node.usize()) From 3045d6d6c63ef913300b5c437fcada2f6bb82689 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 18:33:50 +0100 Subject: [PATCH 32/36] Improve link degree formatting --- brood/src/commands/stats/degrees.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/brood/src/commands/stats/degrees.rs b/brood/src/commands/stats/degrees.rs index e73c0cf..5bd05be 100644 --- a/brood/src/commands/stats/degrees.rs +++ b/brood/src/commands/stats/degrees.rs @@ -1,5 +1,7 @@ use std::{cmp::Reverse, io}; +use thousands::Separable; + use crate::{ algo, data::{Data, Page}, @@ -77,8 +79,14 @@ impl Cmd { } fn print_links(&self, by_degrees: &Vec<(&Page, usize, usize)>) { - for (page, od, id) in by_degrees.iter().take(self.top) { - println!("{} ({od} out, {id} in)", util::fmt_page(page)); + for (i, (page, od, id)) in by_degrees.iter().take(self.top).enumerate() { + println!( + "{:3}. 
{} ({} out, {} in)", + i + 1, + util::fmt_page(page), + od.separate_with_underscores(), + id.separate_with_underscores() + ); } } } From b2a8597c6ff7021045dfd30075329a1312db5605 Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 31 Dec 2024 19:54:43 +0100 Subject: [PATCH 33/36] Add longest-path command --- brood/src/commands.rs | 1 + brood/src/commands/list_pages.rs | 23 --- brood/src/commands/longest_path.rs | 70 ++++++++ brood/src/commands/longest_shortest_path.rs | 173 -------------------- brood/src/main.rs | 6 +- 5 files changed, 75 insertions(+), 198 deletions(-) delete mode 100644 brood/src/commands/list_pages.rs create mode 100644 brood/src/commands/longest_path.rs delete mode 100644 brood/src/commands/longest_shortest_path.rs diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 6c9bd5e..9885f8a 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,5 +1,6 @@ pub mod export; pub mod ingest; +pub mod longest_path; pub mod path; pub mod show; pub mod stats; diff --git a/brood/src/commands/list_pages.rs b/brood/src/commands/list_pages.rs deleted file mode 100644 index 5f659ea..0000000 --- a/brood/src/commands/list_pages.rs +++ /dev/null @@ -1,23 +0,0 @@ -use std::fs::File; -use std::io::{self, BufReader}; -use std::path::Path; - -use crate::data::store; - -pub fn run(datafile: &Path) -> io::Result<()> { - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - - for (page_idx, page) in data.pages() { - if page.data.redirect { - for link_idx in data.link_range(page_idx) { - let target_page = data.page(data.link(link_idx).to); - println!("{:?} -> {:?}", page.data.title, target_page.data.title); - } - } else { - println!("{:?}", page.data.title); - } - } - - Ok(()) -} diff --git a/brood/src/commands/longest_path.rs b/brood/src/commands/longest_path.rs new file mode 100644 index 0000000..1ac8e40 --- /dev/null +++ b/brood/src/commands/longest_path.rs @@ -0,0 +1,70 @@ +use std::io; + 
+use crate::{ + algo::Dijkstra, + data::Data, + graph::NodeIdx, + util::{self, TitleNormalizer}, +}; + +/// Find the article with the longest shortest path away from the starting +/// article. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + start: String, + #[arg(long, short, default_value_t = 1)] + top: usize, +} + +fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec)>) { + let start = &data.pages[start.usize()].title; + let goal = &data.pages[goal.usize()].title; + + let Some((cost, path)) = path else { + println!("No path found from {start} to {goal}"); + return; + }; + + println!("Path found (cost {cost}, length {}):", path.len()); + + for page in path { + println!("{}", util::fmt_page(&data.pages[page.usize()])); + } +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + println!(">> Resolve article"); + let start = util::resolve_title(&normalizer, &data, &self.start); + println!("Start: {}", data.pages[start.usize()].title); + + println!(">> Search paths"); + println!("> Preparing dijkstra"); + let mut dijkstra = Dijkstra::new(&data.graph); + println!("> Running dijkstra"); + dijkstra.run( + start, + |_| false, + |source, _edge, _target| !data.pages[source.usize()].redirect as u32, + ); + + println!(">> Find longest paths"); + let mut costs = data + .graph + .nodes() + .map(|n| (dijkstra.cost(n), n)) + .filter(|(c, _)| *c < u32::MAX) // Only reachable nodes please + .collect::>(); + costs.sort_unstable(); + + for (cost, goal) in costs.iter().rev().take(self.top) { + let path = dijkstra.path(*goal); + println!(); + print_path(&data, start, *goal, Some((*cost, path))); + } + + Ok(()) + } +} diff --git a/brood/src/commands/longest_shortest_path.rs b/brood/src/commands/longest_shortest_path.rs deleted file mode 100644 index e15eb17..0000000 --- a/brood/src/commands/longest_shortest_path.rs +++ /dev/null @@ -1,173 +0,0 @@ -use std::collections::BinaryHeap; -use 
std::fs::File; -use std::io::{self, BufReader}; -use std::path::Path; - -use crate::data::adjacency_list::AdjacencyList; -use crate::data::info::{LinkInfo, PageInfo}; -use crate::data::store; -use crate::util; - -struct DijkstraPageInfo { - cost: u32, - /// Index of the previous page. - prev: u32, - redirect: bool, -} - -impl DijkstraPageInfo { - fn from_page_info(info: PageInfo) -> Self { - Self { - cost: u32::MAX, - prev: u32::MAX, - redirect: info.redirect, - } - } -} - -struct DijkstraLinkInfo { - cost: u32, -} - -impl DijkstraLinkInfo { - fn from_link_info(info: LinkInfo) -> Self { - Self { - cost: 1, - // cost: 1000 + info.start, - // cost: 10000 + info.start, - // cost: 1000 + info.start / 10, - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq)] -struct Entry { - cost: u32, - page_idx: u32, -} - -impl Entry { - pub fn new(cost: u32, page_idx: u32) -> Self { - Self { cost, page_idx } - } -} - -// Manual implementation so the queue is a min-heap instead of a max-heap. -impl Ord for Entry { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - other - .cost - .cmp(&self.cost) - .then_with(|| self.page_idx.cmp(&other.page_idx)) - } -} - -impl PartialOrd for Entry { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -/// Closely matches the dijkstra example in [std::collections::binary_heap]. 
-fn full_dijkstra( - data: AdjacencyList, - from: u32, -) -> AdjacencyList { - println!("> Prepare state"); - let mut data = data - .change_page_data(DijkstraPageInfo::from_page_info) - .change_link_data(DijkstraLinkInfo::from_link_info); - let mut queue = BinaryHeap::new(); - data.page_mut(from).data.cost = 0; - queue.push(Entry::new(0, from)); - - println!("> Run dijkstra"); - while let Some(Entry { cost, page_idx }) = queue.pop() { - let page = data.page(page_idx); - if cost > page.data.cost { - // This queue entry is outdated - continue; - } - - let redirect = page.data.redirect; - for link_idx in data.link_range(page_idx) { - let link = data.link(link_idx); - - let next = Entry { - cost: cost + if redirect { 0 } else { link.data.cost }, - page_idx: link.to, - }; - - let target_page = data.page_mut(link.to); - if next.cost < target_page.data.cost { - target_page.data.cost = next.cost; - target_page.data.prev = page_idx; - queue.push(next); - } - } - } - - data -} - -fn find_longest_shortest_path( - data: AdjacencyList, - from: u32, -) -> Option> { - let to = data - .pages - .iter() - .enumerate() - .filter(|(_, p)| p.data.cost != u32::MAX) - .max_by_key(|(_, p)| p.data.cost)? 
- .0 as u32; - - let mut steps = vec![]; - let mut at = to; - loop { - steps.push(at); - at = data.page(at).data.prev; - if at == u32::MAX { - break; - }; - } - steps.reverse(); - if steps.first() == Some(&from) { - Some(steps) - } else { - None - } -} - -pub fn run(datafile: &Path, from: &str) -> io::Result<()> { - println!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - let pages = data.pages.clone(); - - println!(">> Locate from and to"); - let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from)); - println!("From: {:?}", data.page(from_idx).data.title); - - println!(">> Find all shortest paths"); - let data = full_dijkstra(data, from_idx); - - println!(">> Find longest shortest path"); - let path = find_longest_shortest_path(data, from_idx); - - if let Some(path) = path { - println!("Path found:"); - for page_idx in path { - let page = &pages[page_idx as usize]; - if page.data.redirect { - println!(" v {:?}", page.data.title); - } else { - println!(" - {:?}", page.data.title); - } - } - } else { - println!("No path found"); - } - - Ok(()) -} diff --git a/brood/src/main.rs b/brood/src/main.rs index ba71e52..66b14df 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -14,8 +14,9 @@ enum Command { Ingest(commands::ingest::Cmd), Export(commands::export::Cmd), Show(commands::show::Cmd), - Path(commands::path::Cmd), Stats(commands::stats::Cmd), + Path(commands::path::Cmd), + LongestPath(commands::longest_path::Cmd), } #[derive(Debug, Parser)] @@ -73,7 +74,8 @@ fn main() -> io::Result<()> { Command::Ingest(_) => unreachable!(), Command::Export(cmd) => cmd.run(data), Command::Show(cmd) => cmd.run(data), - Command::Path(cmd) => cmd.run(data), Command::Stats(cmd) => cmd.run(data), + Command::Path(cmd) => cmd.run(data), + Command::LongestPath(cmd) => cmd.run(data), } } From 8016bbfc83a1dde9dba3a6be9e55082d7296b899 Mon Sep 17 00:00:00 2001 From: Joscha 
Date: Wed, 1 Jan 2025 00:59:03 +0100 Subject: [PATCH 34/36] Port and rename pg command --- brood/src/commands.rs | 1 + brood/src/commands/pg.rs | 273 ++++++++++++++++++++++++++ brood/src/commands/philosophy_game.rs | 269 ------------------------- brood/src/main.rs | 2 + 4 files changed, 276 insertions(+), 269 deletions(-) create mode 100644 brood/src/commands/pg.rs delete mode 100644 brood/src/commands/philosophy_game.rs diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 9885f8a..fbb29d7 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -2,5 +2,6 @@ pub mod export; pub mod ingest; pub mod longest_path; pub mod path; +pub mod pg; pub mod show; pub mod stats; diff --git a/brood/src/commands/pg.rs b/brood/src/commands/pg.rs new file mode 100644 index 0000000..a106a3a --- /dev/null +++ b/brood/src/commands/pg.rs @@ -0,0 +1,273 @@ +use std::{ + collections::{BTreeSet, HashMap, HashSet}, + io::{self, BufWriter}, +}; + +use crate::{ + data::Data, + graph::NodeIdx, + util::{self, TitleNormalizer}, +}; + +struct PageMap(Vec); + +impl PageMap { + fn new(len: usize) -> Self { + Self(vec![NodeIdx::NONE; len]) + } + + fn get(&self, node: NodeIdx) -> NodeIdx { + self.0[node.usize()] + } + + fn set(&mut self, node: NodeIdx, to: NodeIdx) { + self.0[node.usize()] = to; + } +} + +fn first_viable_link(data: &Data, node: NodeIdx) -> Option { + for edge in data.graph.edge_slice(node) { + let link = &data.links[edge.usize()]; + if !link.in_parens() && !link.in_structure() { + return Some(*edge); + } + } + None +} + +fn find_forward_edges(data: &Data) -> PageMap { + let mut result = PageMap::new(data.pages.len()); + for node in data.graph.nodes() { + if let Some(first_link) = first_viable_link(data, node) { + result.set(node, first_link); + } + } + result +} + +fn find_clusters(data: &Data, forward: &PageMap) -> PageMap { + let mut cluster = PageMap::new(data.pages.len()); + for node in data.graph.nodes() { + let mut current = node; + let mut visited = 
HashSet::new(); + let canonical = loop { + // We've already determined the canonical element for this page. + if cluster.get(current) != NodeIdx::NONE { + break cluster.get(current); + } + + // We've hit a loop + if visited.contains(¤t) { + let mut loop_members = BTreeSet::new(); + while !loop_members.contains(¤t) { + loop_members.insert(current); + current = forward.get(current); + } + break loop_members.pop_first().unwrap(); + } + + visited.insert(current); + + let next = forward.get(current); + if next == NodeIdx::NONE { + // We've hit a dead-end + break current; + } + + current = next; + }; + + for i in visited { + cluster.set(i, canonical); + } + } + + cluster +} + +enum Cluster { + DeadEnd(NodeIdx), + Loop(Vec), +} + +fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap { + let mut result = HashMap::new(); + for canonical in cluster.0.iter().copied().collect::>() { + if forward.get(canonical) == NodeIdx::NONE { + result.insert(canonical, Cluster::DeadEnd(canonical)); + continue; + } + + let mut members = vec![]; + let mut current = canonical; + loop { + members.push(current); + current = forward.get(current); + if current == canonical { + break; + } + } + result.insert(canonical, Cluster::Loop(members)); + } + + result +} + +fn print_forward_edges_as_json(data: &Data, forward: &PageMap) -> io::Result<()> { + let map = forward + .0 + .iter() + .enumerate() + .map(|(node, first_link)| { + let page_title = &data.pages[node].title; + let first_link_title = if *first_link == NodeIdx::NONE { + None + } else { + Some(&data.pages[first_link.usize()].title) + }; + (page_title, first_link_title) + }) + .collect::>(); + + let writer = BufWriter::new(io::stdout()); + serde_json::to_writer_pretty(writer, &map)?; + Ok(()) +} + +fn print_trace(normalizer: &TitleNormalizer, data: &Data, forward: &PageMap, start: &str) { + let start_idx = util::resolve_title(normalizer, data, start); + + let mut current = start_idx; + let mut visited = HashSet::new(); + loop { 
+ let page = &data.pages[current.usize()]; + let title = &page.title; + if page.redirect { + println!(" v {title}"); + } else { + println!(" - {title}"); + } + + visited.insert(current); + + let next = forward.get(current); + + if next == NodeIdx::NONE { + println!("> dead-end reached"); + return; + } + + if visited.contains(&next) { + let page = &data.pages[next.usize()]; + let title = &page.title; + println!("> loop detected ({title})"); + return; + } + + current = next; + } +} + +fn print_canonical_pages_as_json(data: &Data, cluster: &PageMap) -> io::Result<()> { + let map = cluster + .0 + .iter() + .enumerate() + .map(|(page, canonical)| { + ( + &data.pages[page].title, + &data.pages[canonical.usize()].title, + ) + }) + .collect::>(); + + let writer = BufWriter::new(io::stdout()); + serde_json::to_writer_pretty(writer, &map)?; + Ok(()) +} + +#[derive(Debug, PartialEq, Eq, clap::Parser)] +enum Command { + First, + Trace { start: String }, + Canonical, + Cluster, +} + +/// Show interesting stats. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + #[command(subcommand)] + command: Command, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + eprintln!(">> Forward"); + let forward = find_forward_edges(&data); + + match self.command { + Command::First => { + eprintln!(">> First links"); + print_forward_edges_as_json(&data, &forward)?; + return Ok(()); + } + Command::Trace { start } => { + eprintln!(">> Tracing"); + print_trace(&normalizer, &data, &forward, &start); + return Ok(()); + } + _ => {} + } + + // Determine cluster for each page, represented via canonical page. The + // canonical page of a cluster is either a dead-end or the loop member with + // the smallest index. 
+ eprintln!(">> Find clusters"); + let cluster = find_clusters(&data, &forward); + + if self.command == Command::Canonical { + print_canonical_pages_as_json(&data, &cluster)?; + return Ok(()); + } + + // Measure cluster size + eprintln!(">> Measure clusters"); + let mut cluster_size = HashMap::::new(); + for (i, canonical) in cluster.0.iter().enumerate() { + assert!(*canonical != NodeIdx::NONE, "{}", data.pages[i].title); + *cluster_size.entry(*canonical).or_default() += 1; + } + let mut cluster_by_size = cluster_size.into_iter().collect::>(); + cluster_by_size.sort_by_key(|(c, s)| (*s, *c)); + cluster_by_size.reverse(); + + // Print clusters + assert!(self.command == Command::Cluster); + let resolved = resolve_clusters(&forward, &cluster); + for (canonical, size) in cluster_by_size { + match resolved.get(&canonical).unwrap() { + Cluster::DeadEnd(page) => { + let title = &data.pages[page.usize()].title; + println!("Cluster (dead-end, {size}): {title}"); + } + Cluster::Loop(pages) => { + println!("Cluster ({}-loop, {size}):", pages.len()); + for page in pages { + let page = &data.pages[page.usize()]; + let title = &page.title; + if page.redirect { + println!(" v {title}"); + } else { + println!(" - {title}"); + } + } + } + } + } + + Ok(()) + } +} diff --git a/brood/src/commands/philosophy_game.rs b/brood/src/commands/philosophy_game.rs deleted file mode 100644 index b276bd5..0000000 --- a/brood/src/commands/philosophy_game.rs +++ /dev/null @@ -1,269 +0,0 @@ -use std::{ - collections::{BTreeSet, HashMap, HashSet}, - fs::File, - io::{self, BufReader, BufWriter}, - path::Path, -}; - -use crate::{ - data::{ - adjacency_list::AdjacencyList, - info::{LinkInfo, PageInfo}, - store, - }, - util, PhilosophyGameCmd, -}; - -struct PageMap(Vec); - -impl PageMap { - fn new(len: usize) -> Self { - Self(vec![u32::MAX; len]) - } - - fn get(&self, page_idx: u32) -> u32 { - self.0[page_idx as usize] - } - - fn set(&mut self, page_idx: u32, to: u32) { - self.0[page_idx as usize] = to; 
- } -} - -fn first_viable_link(data: &AdjacencyList, page_idx: u32) -> Option { - for link_idx in data.link_range(page_idx) { - let link = data.link(link_idx); - if !link.data.in_parens() && !link.data.in_structure() { - return Some(link.to); - } - } - None -} - -fn find_forward_edges(data: &AdjacencyList) -> PageMap { - let mut result = PageMap::new(data.pages.len()); - for (page_idx, _) in data.pages() { - if let Some(first_link) = first_viable_link(data, page_idx) { - result.set(page_idx, first_link); - } - } - result -} - -fn find_clusters(data: &AdjacencyList, forward: &PageMap) -> PageMap { - let mut cluster = PageMap::new(data.pages.len()); - for (page_idx, _) in data.pages() { - let mut current = page_idx; - let mut visited = HashSet::new(); - let canonical = loop { - // We've already determined the canonical element for this page. - if cluster.get(current) != u32::MAX { - break cluster.get(current); - } - - // We've hit a loop - if visited.contains(¤t) { - let mut loop_members = BTreeSet::new(); - while !loop_members.contains(¤t) { - loop_members.insert(current); - current = forward.get(current); - } - break loop_members.pop_first().unwrap(); - } - - visited.insert(current); - - let next = forward.get(current); - if next == u32::MAX { - // We've hit a dead-end - break current; - } - - current = next; - }; - - for i in visited { - cluster.set(i, canonical); - } - } - - cluster -} - -enum Cluster { - DeadEnd(u32), - Loop(Vec), -} - -fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap { - let mut result = HashMap::new(); - for canonical in cluster.0.iter().copied().collect::>() { - if forward.get(canonical) == u32::MAX { - result.insert(canonical, Cluster::DeadEnd(canonical)); - continue; - } - - let mut members = vec![]; - let mut current = canonical; - loop { - members.push(current); - current = forward.get(current); - if current == canonical { - break; - } - } - result.insert(canonical, Cluster::Loop(members)); - } - - result -} - -fn 
print_forward_edges_as_json( - data: &AdjacencyList, - forward: &PageMap, -) -> io::Result<()> { - let map = forward - .0 - .iter() - .enumerate() - .map(|(page, first_link)| { - let page_title = &data.page(page as u32).data.title; - let first_link_title = if *first_link == u32::MAX { - None - } else { - Some(&data.page(*first_link).data.title) - }; - (page_title, first_link_title) - }) - .collect::>(); - - let writer = BufWriter::new(io::stdout()); - serde_json::to_writer_pretty(writer, &map)?; - Ok(()) -} - -fn print_trace(data: &AdjacencyList, forward: &PageMap, start: &str) { - let start_idx = util::resolve_redirects(data, util::find_index_of_title(&data.pages, start)); - - let mut current = start_idx; - let mut visited = HashSet::new(); - loop { - let page = data.page(current); - let title = &page.data.title; - if page.data.redirect { - println!(" v {title}"); - } else { - println!(" - {title}"); - } - - visited.insert(current); - - let next = forward.get(current); - - if next == u32::MAX { - println!("> dead-end reached"); - return; - } - - if visited.contains(&next) { - let page = data.page(next); - let title = &page.data.title; - println!("> loop detected ({title})"); - return; - } - - current = next; - } -} - -fn print_canonical_pages_as_json( - data: &AdjacencyList, - cluster: &PageMap, -) -> io::Result<()> { - let map = cluster - .0 - .iter() - .enumerate() - .map(|(page, canonical)| { - ( - &data.page(page as u32).data.title, - &data.page(*canonical).data.title, - ) - }) - .collect::>(); - - let writer = BufWriter::new(io::stdout()); - serde_json::to_writer_pretty(writer, &map)?; - Ok(()) -} - -pub fn run(datafile: &Path, subcmd: PhilosophyGameCmd) -> io::Result<()> { - eprintln!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - - eprintln!(">> Forward"); - let forward = find_forward_edges(&data); - - match subcmd { - PhilosophyGameCmd::First => { - eprintln!(">> First 
links"); - print_forward_edges_as_json(&data, &forward)?; - return Ok(()); - } - PhilosophyGameCmd::Trace { start } => { - eprintln!(">> Tracing"); - print_trace(&data, &forward, &start); - return Ok(()); - } - _ => {} - } - - // Determine cluster for each page, represented via canonical page. The - // canonical page of a cluster is either a dead-end or the loop member with - // the smallest index. - eprintln!(">> Find clusters"); - let cluster = find_clusters(&data, &forward); - - if subcmd == PhilosophyGameCmd::Canonical { - print_canonical_pages_as_json(&data, &cluster)?; - return Ok(()); - } - - // Measure cluster size - eprintln!(">> Measure clusters"); - let mut cluster_size = HashMap::::new(); - for (i, canonical) in cluster.0.iter().enumerate() { - assert!(*canonical != u32::MAX, "{}", data.page(i as u32).data.title); - *cluster_size.entry(*canonical).or_default() += 1; - } - let mut cluster_by_size = cluster_size.into_iter().collect::>(); - cluster_by_size.sort_by_key(|(c, s)| (*s, *c)); - cluster_by_size.reverse(); - - // Print clusters - assert!(subcmd == PhilosophyGameCmd::Cluster); - let resolved = resolve_clusters(&forward, &cluster); - for (canonical, size) in cluster_by_size { - match resolved.get(&canonical).unwrap() { - Cluster::DeadEnd(page) => { - let title = &data.page(*page).data.title; - println!("Cluster (dead-end, {size}): {title}"); - } - Cluster::Loop(pages) => { - println!("Cluster ({}-loop, {size}):", pages.len()); - for page in pages { - let page = data.page(*page); - let title = &page.data.title; - if page.data.redirect { - println!(" v {title}"); - } else { - println!(" - {title}"); - } - } - } - } - } - - Ok(()) -} diff --git a/brood/src/main.rs b/brood/src/main.rs index 66b14df..270aee8 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -17,6 +17,7 @@ enum Command { Stats(commands::stats::Cmd), Path(commands::path::Cmd), LongestPath(commands::longest_path::Cmd), + Pg(commands::pg::Cmd), } #[derive(Debug, Parser)] @@ -77,5 
+78,6 @@ fn main() -> io::Result<()> { Command::Stats(cmd) => cmd.run(data), Command::Path(cmd) => cmd.run(data), Command::LongestPath(cmd) => cmd.run(data), + Command::Pg(cmd) => cmd.run(data), } } From d9fd29c1c3b9e93b7ce755926633366893972778 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 1 Jan 2025 15:59:39 +0100 Subject: [PATCH 35/36] Add progress bars to ingest command --- brood/Cargo.lock | 155 +++++++++++++++++++++++++++++++++++ brood/Cargo.toml | 1 + brood/src/commands/ingest.rs | 60 +++++++++----- 3 files changed, 195 insertions(+), 21 deletions(-) diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 414bb49..180ca5c 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -65,12 +65,25 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", + "indicatif", "regex", "serde", "serde_json", "thousands", ] +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "clap" version = "4.5.23" @@ -117,12 +130,44 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +[[package]] +name = "console" +version = "0.15.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys", +] + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "heck" version = 
"0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "indicatif" +version = "0.17.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -135,12 +180,52 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "js-sys" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "portable-atomic" +version = "1.10.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + [[package]] name = "proc-macro2" version = "1.0.92" @@ -255,12 +340,82 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "wasm-bindgen" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = 
"wasm-bindgen-shared" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "windows-sys" version = "0.59.0" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index 0dd4156..99890b6 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [dependencies] clap = { version = "4.5.23", features = ["derive", "deprecated"] } +indicatif = "0.17.9" regex = "1.11.1" serde = { version = "1.0.217", features = ["derive"] } serde_json = "1.0.134" diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 2036062..74f5663 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -5,15 +5,30 @@ use std::{ path::{Path, PathBuf}, }; +use indicatif::{ProgressBar, ProgressStyle}; use serde::Deserialize; use thousands::Separable; use crate::{ data::{Data, Link, Page}, graph::NodeIdx, - util::{Counter, TitleNormalizer}, + util::TitleNormalizer, }; +const PROGRESS_CHARS: &str = "█▉▊▋▌▍▎▏ "; + +fn seek_to_start(f: &mut BufReader) -> io::Result { + let size = f.seek(io::SeekFrom::End(0))?; + f.seek(io::SeekFrom::Start(0))?; + Ok(size) +} + +fn file_progress_style() -> ProgressStyle { + ProgressStyle::with_template("{wide_bar} {bytes}/{total_bytes}") + .unwrap() + .progress_chars(PROGRESS_CHARS) +} + #[derive(Deserialize)] struct JsonPage { id: u32, @@ -23,17 +38,17 @@ struct JsonPage { redirect: Option, } -fn read_titles(r: &mut BufReader) -> io::Result> { - let mut counter = Counter::new(); +fn read_titles(f: &mut BufReader) -> io::Result> { + let size = seek_to_start(f)?; + let bar = ProgressBar::new(size).with_style(file_progress_style()); + 
let mut titles = vec![]; - for line in r.lines() { - counter.tick(); + for line in bar.wrap_read(f).lines() { let page = serde_json::from_str::(&line?).unwrap(); titles.push(page.title); } - counter.done(); Ok(titles) } @@ -49,12 +64,12 @@ fn compute_title_lookup( normalizer: &TitleNormalizer, titles: &[String], ) -> HashMap { - let mut counter = Counter::new(); let mut title_lookup = HashMap::::new(); - for (sift_i, title) in titles.iter().enumerate() { - counter.tick(); + let bar = ProgressBar::new(titles.len() as u64) + .with_style(ProgressStyle::default_bar().progress_chars(PROGRESS_CHARS)); + for (sift_i, title) in bar.wrap_iter(titles.iter()).enumerate() { // The index where this article will appear in the final list, assuming // it is not a duplicate. For ownership reasons, we compute this here // instead of inside the Entry::Vacant branch of the following match. @@ -68,31 +83,33 @@ fn compute_title_lookup( let prev_sift_i = entry.get().0; let prev = &titles[prev_sift_i as usize]; if prev == title { - println!(" {title:?} ({prev_sift_i}) occurs again at {sift_i}"); + bar.println(format!( + " {title:?} ({prev_sift_i}) occurs again at {sift_i}" + )); } else { - println!( - " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) both normalize to {:?}", + bar.println(format!( + " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) normalize to {:?}", normalizer.normalize(title) - ); + )); } } } } - counter.done(); title_lookup } fn read_page_data( normalizer: &TitleNormalizer, title_lookup: &HashMap, - r: &mut BufReader, + f: &mut BufReader, ) -> io::Result { - let mut counter = Counter::new(); + let size = seek_to_start(f)?; + let bar = ProgressBar::new(size).with_style(file_progress_style()); + let mut data = Data::new(); - for (i, line) in r.lines().enumerate() { - counter.tick(); + for (i, line) in bar.wrap_read(f).lines().enumerate() { let page = serde_json::from_str::(&line?).unwrap(); let normalized = normalizer.normalize(&page.title); @@ -100,7 +117,10 @@ fn 
read_page_data( if i as u32 != sift_i { // Articles may occur multiple times, and this is not the instance // of the article we should keep. - println!(" Skipping {:?} ({i}) in favor of {sift_i}", page.title); + bar.println(format!( + " Skipping {:?} ({i}) in favor of {sift_i}", + page.title + )); continue; } @@ -127,7 +147,6 @@ fn read_page_data( } } - counter.done(); Ok(data) } @@ -153,7 +172,6 @@ impl Cmd { drop(titles); // Don't hoard memory println!(">> Second pass"); - sift_data.seek(io::SeekFrom::Start(0))?; println!("> Reading page data"); let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?; From ee66509dd8c210e33203deb64dc97f002115b05f Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 3 Jan 2025 02:43:40 +0100 Subject: [PATCH 36/36] Remove now-obsolete Counter --- brood/src/util.rs | 36 +----------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/brood/src/util.rs b/brood/src/util.rs index bf42980..cc6ee42 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,46 +1,12 @@ -use std::{collections::HashSet, fmt, time::Instant}; +use std::{collections::HashSet, fmt}; use regex::Regex; -use thousands::Separable; use crate::{ data::{Data, Page}, graph::NodeIdx, }; -pub struct Counter { - n: usize, - last_print: Instant, -} - -impl Counter { - pub fn new() -> Self { - Self { - n: 0, - last_print: Instant::now(), - } - } - - pub fn tick(&mut self) { - self.n += 1; - if self.n % 10_000 != 0 { - return; - } - - let now = Instant::now(); - if now.duration_since(self.last_print).as_secs() < 4 { - return; - } - - println!("{:>12}", self.n.separate_with_underscores()); - self.last_print = now; - } - - pub fn done(&self) { - println!("{:>12} (done)", self.n.separate_with_underscores()); - } -} - // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/mediawiki.Title.phpCharToUpper.js struct PhpCharToUpper(char);