diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 0162043..180ca5c 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -1,12 +1,21 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] [[package]] name = "anstream" -version = "0.6.14" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", @@ -19,33 +28,33 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.7" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.0" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.3" +version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", "windows-sys", @@ -56,16 +65,30 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", - "rustc-hash", + "indicatif", + "regex", "serde", "serde_json", + "thousands", ] [[package]] -name = "clap" -version = "4.5.7" +name = "bumpalo" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", "clap_derive", @@ -73,9 +96,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.7" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstream", "anstyle", @@ -85,9 +108,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.5" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ "heck", "proc-macro2", @@ -97,15 +120,34 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.1" +version = "0.7.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "colorchoice" -version = "1.0.1" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "console" +version = "0.15.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys", +] + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] name = "heck" @@ -114,40 +156,122 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "is_terminal_polyfill" -version = "1.70.0" +name = "indicatif" +version = "0.17.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "js-sys" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.38" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] [[package]] -name = "rustc-hash" -version = "2.0.0" +name = "regex" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "ryu" @@ -157,18 +281,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "serde" -version = "1.0.203" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.203" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", @@ -177,11 +301,12 @@ dependencies = [ [[package]] name = "serde_json" -version = 
"1.0.118" +version = "1.0.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4" +checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -194,9 +319,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.68" +version = "2.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" +checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058" dependencies = [ "proc-macro2", "quote", @@ -204,10 +329,22 @@ dependencies = [ ] [[package]] -name = "unicode-ident" -version = "1.0.12" +name = "thousands" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" + +[[package]] +name = "unicode-ident" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "utf8parse" @@ -216,19 +353,83 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] -name = "windows-sys" -version = "0.52.0" +name = "wasm-bindgen" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +checksum = 
"a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +checksum = 
"9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -242,48 +443,48 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 
[[package]] name = "windows_x86_64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index 940f920..99890b6 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -4,7 +4,9 @@ version = "0.0.0" edition = "2021" [dependencies] -clap = { version = "4.5.7", features = ["derive", "deprecated"] } -rustc-hash = "2.0.0" -serde = { version = "1.0.203", features = ["derive"] } -serde_json = "1.0.118" +clap = { version = "4.5.23", features = ["derive", "deprecated"] } +indicatif = "0.17.9" +regex = "1.11.1" +serde = { version = "1.0.217", features = ["derive"] } +serde_json = "1.0.134" +thousands = "0.2.0" diff --git a/brood/src/algo.rs b/brood/src/algo.rs new file mode 100644 index 0000000..ac1919f --- /dev/null +++ b/brood/src/algo.rs @@ -0,0 +1,4 @@ +mod dijkstra; +mod edit; + +pub use self::{dijkstra::*, edit::*}; diff --git a/brood/src/algo/dijkstra.rs b/brood/src/algo/dijkstra.rs new file mode 100644 index 0000000..b6bf26a --- /dev/null +++ b/brood/src/algo/dijkstra.rs @@ -0,0 +1,77 @@ +use std::{cmp::Reverse, collections::BinaryHeap}; + +use crate::graph::{EdgeIdx, Graph, NodeIdx}; + +pub struct Dijkstra<'a> { + graph: &'a Graph, + cost: Vec, + pred: Vec, +} + +impl<'a> Dijkstra<'a> { + pub fn new(graph: &'a Graph) -> Self { + Self { + graph, + cost: vec![u32::MAX; graph.nodes.len()], + pred: vec![NodeIdx::NONE; graph.nodes.len()], + } + } + + pub fn run( + &mut self, + start: NodeIdx, + goal: impl 
Fn(NodeIdx) -> bool, + cost: impl Fn(NodeIdx, EdgeIdx, NodeIdx) -> u32, + ) { + self.cost[start.usize()] = 0; + let mut queue = BinaryHeap::new(); + queue.push((Reverse(0), start)); + + while let Some((Reverse(curr_cost), curr)) = queue.pop() { + if goal(curr) { + break; // We've found the shortest path to our target + } + + // These seem to never actually occur + // if curr_cost > self.cost[curr.usize()] { + // continue; // Outdated entry + // } + + for edge in self.graph.edge_range(curr).map(EdgeIdx::new) { + let next = self.graph.edges[edge.usize()]; + let next_cost = curr_cost + cost(curr, edge, next); + if next_cost < self.cost[next.usize()] { + self.cost[next.usize()] = next_cost; + self.pred[next.usize()] = curr; + queue.push((Reverse(next_cost), next)); + } + } + } + } + + #[inline] + pub fn cost(&self, node: NodeIdx) -> u32 { + self.cost[node.usize()] + } + + #[inline] + pub fn pred(&self, node: NodeIdx) -> NodeIdx { + self.pred[node.usize()] + } + + pub fn path(&self, goal: NodeIdx) -> Vec { + let mut path = vec![]; + let mut at = goal; + + loop { + path.push(at); + at = self.pred(at); + if at == NodeIdx::NONE { + break; + } + } + + path.reverse(); + path + } +} diff --git a/brood/src/algo/edit.rs b/brood/src/algo/edit.rs new file mode 100644 index 0000000..2be0c0a --- /dev/null +++ b/brood/src/algo/edit.rs @@ -0,0 +1,97 @@ +use std::mem; + +use crate::{ + data::{Data, Link}, + graph::NodeIdx, + util, +}; + +pub fn retain_edges(data: &mut Data, f: impl Fn(&Link) -> bool) { + let mut links = mem::take(&mut data.links).into_iter(); + let graph = mem::take(&mut data.graph); + + for node in graph.nodes() { + data.graph.add_node(); + + for edge in graph.edge_slice(node) { + let link = links.next().unwrap(); + if f(&link) { + data.links.push(link); + data.graph.add_edge(*edge); + } + } + } +} + +pub fn resolve_redirects(data: &mut Data) { + // Permutation from input node to input node + let mut perm_redirect = vec![NodeIdx::NONE; data.pages.len()]; + for node 
in data.graph.nodes() { + perm_redirect[node.usize()] = util::resolve_redirects(data, node); + } + + // Permutation from input node to final node + let mut perm_retain = vec![NodeIdx::NONE; data.pages.len()]; + let mut perm_retain_count = NodeIdx(0); + for (i, page) in data.pages.iter().enumerate() { + if !page.redirect { + perm_retain[i] = perm_retain_count; + perm_retain_count += 1; + } + } + + let mut pages = mem::take(&mut data.pages).into_iter(); + let mut links = mem::take(&mut data.links).into_iter(); + let graph = mem::take(&mut data.graph); + + for node in graph.nodes() { + let page = pages.next().unwrap(); + let new_node = perm_retain[node.usize()]; + + if new_node == NodeIdx::NONE { + // Skip all edges + for _ in graph.edge_slice(node) { + links.next().unwrap(); + } + continue; + } + + data.pages.push(page); + data.graph.add_node(); + + for edge in graph.edge_slice(node) { + let link = links.next().unwrap(); + let new_edge = perm_retain[perm_redirect[edge.usize()].usize()]; + + if new_edge == NodeIdx::NONE { + continue; + } + + data.links.push(link); + data.graph.add_edge(new_edge); + } + } +} + +pub fn invert(data: &mut Data) { + let links = mem::take(&mut data.links); + let graph = mem::take(&mut data.graph); + + let mut edges = graph + .edges() + .zip(links) + .map(|((source, target), link)| (source, target, link)) + .collect::>(); + + edges.sort_by_key(|(_, target, _)| *target); + + let mut edges = edges.into_iter().peekable(); + for node in graph.nodes() { + data.graph.add_node(); + while edges.peek().is_some_and(|(_, target, _)| *target <= node) { + let (source, _, link) = edges.next().unwrap(); + data.graph.add_edge(source); + data.links.push(link); + } + } +} diff --git a/brood/src/commands.rs b/brood/src/commands.rs index ffff9d3..fbb29d7 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,6 +1,7 @@ +pub mod export; pub mod ingest; -pub mod list_pages; -pub mod longest_shortest_path; +pub mod longest_path; pub mod path; -pub mod 
philosophy_game; -pub mod reexport; +pub mod pg; +pub mod show; +pub mod stats; diff --git a/brood/src/commands/export.rs b/brood/src/commands/export.rs new file mode 100644 index 0000000..aad5dd8 --- /dev/null +++ b/brood/src/commands/export.rs @@ -0,0 +1,17 @@ +use std::{io, path::PathBuf}; + +use crate::data::Data; + +#[derive(Debug, clap::Parser)] +pub struct Cmd { + out: PathBuf, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + println!(">> Export"); + data.write_to_file(&self.out)?; + + Ok(()) + } +} diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index cda10d0..74f5663 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -1,16 +1,33 @@ -use std::collections::hash_map::Entry; -use std::fs::File; -use std::io::{self, BufRead, BufReader, BufWriter}; -use std::path::Path; -use std::u32; +use std::{ + collections::{hash_map::Entry, HashMap}, + fs::File, + io::{self, BufRead, BufReader, Seek}, + path::{Path, PathBuf}, +}; -use rustc_hash::FxHashMap; +use indicatif::{ProgressBar, ProgressStyle}; use serde::Deserialize; +use thousands::Separable; -use crate::data::adjacency_list::{AdjacencyList, Page}; -use crate::data::info::{LinkInfo, PageInfo}; -use crate::data::store; -use crate::util; +use crate::{ + data::{Data, Link, Page}, + graph::NodeIdx, + util::TitleNormalizer, +}; + +const PROGRESS_CHARS: &str = "█▉▊▋▌▍▎▏ "; + +fn seek_to_start(f: &mut BufReader) -> io::Result { + let size = f.seek(io::SeekFrom::End(0))?; + f.seek(io::SeekFrom::Start(0))?; + Ok(size) +} + +fn file_progress_style() -> ProgressStyle { + ProgressStyle::with_template("{wide_bar} {bytes}/{total_bytes}") + .unwrap() + .progress_chars(PROGRESS_CHARS) +} #[derive(Deserialize)] struct JsonPage { @@ -21,151 +38,161 @@ struct JsonPage { redirect: Option, } -/* -Importing is a tad complicated because of multiple criteria: +fn read_titles(f: &mut BufReader) -> io::Result> { + let size = seek_to_start(f)?; + let bar = 
ProgressBar::new(size).with_style(file_progress_style()); -1. The data must be read in a single pass on stdin -2. The process should not consume a lot of memory - (can't store the decoded json data directly) -3. The process should result in a nice and compact adjacency list format + let mut titles = vec![]; -Because of this, the import is a bit more complex and has two passes. + for line in bar.wrap_read(f).lines() { + let page = serde_json::from_str::(&line?).unwrap(); + titles.push(page.title); + } -The first pass imports the data into an adjacency-list-like format, but the -`Link::to` field points to a title in `Titles` instead of a page. - -The second pass then resolves the links to page indices and throws away all -links that don't point to any known page. -*/ - -#[derive(Default)] -struct Titles { - /// Normalized titles - titles: Vec, - /// Map from normalized title to index in [`Self::titles`]. - map: FxHashMap, + Ok(titles) } -impl Titles { - fn insert(&mut self, title: String) -> u32 { - match self.map.entry(title.clone()) { - Entry::Occupied(occupied) => *occupied.get(), - Entry::Vacant(vacant) => { - let idx = self.titles.len() as u32; - self.titles.push(title); - vacant.insert(idx); - idx +/// Returns a map from normalized title to the index in the brood data where the +/// article will appear. +/// +/// Titles in the title list are not always unique. When multiple identical +/// titles appear, all but one have to be discarded. Originally, I tried to be +/// smart and keep the last occurrence (under the assumption that its data would +/// be the newest), but this led to index-based bugs. Because of this, I now +/// keep the first occurrence. 
+fn compute_title_lookup( + normalizer: &TitleNormalizer, + titles: &[String], +) -> HashMap { + let mut title_lookup = HashMap::::new(); + + let bar = ProgressBar::new(titles.len() as u64) + .with_style(ProgressStyle::default_bar().progress_chars(PROGRESS_CHARS)); + + for (sift_i, title) in bar.wrap_iter(titles.iter()).enumerate() { + // The index where this article will appear in the final list, assuming + // it is not a duplicate. For ownership reasons, we compute this here + // instead of inside the Entry::Vacant branch of the following match. + let brood_i = title_lookup.len(); + + match title_lookup.entry(normalizer.normalize(title)) { + Entry::Vacant(entry) => { + entry.insert((sift_i as u32, brood_i as u32)); + } + Entry::Occupied(entry) => { + let prev_sift_i = entry.get().0; + let prev = &titles[prev_sift_i as usize]; + if prev == title { + bar.println(format!( + " {title:?} ({prev_sift_i}) occurs again at {sift_i}" + )); + } else { + bar.println(format!( + " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) normalize to {:?}", + normalizer.normalize(title) + )); + } } } } - fn get(&self, i: u32) -> &str { - &self.titles[i as usize] - } + title_lookup } -fn first_stage() -> io::Result<(AdjacencyList, Titles)> { - let mut titles = Titles::default(); - let mut result = AdjacencyList::default(); +fn read_page_data( + normalizer: &TitleNormalizer, + title_lookup: &HashMap, + f: &mut BufReader, +) -> io::Result { + let size = seek_to_start(f)?; + let bar = ProgressBar::new(size).with_style(file_progress_style()); - let stdin = BufReader::new(io::stdin()); - for (i, line) in stdin.lines().enumerate() { - let json_page = serde_json::from_str::(&line?).unwrap(); + let mut data = Data::new(); - result.push_page(PageInfo { - id: json_page.id, - length: json_page.length, - redirect: json_page.redirect.is_some(), - title: json_page.title, + for (i, line) in bar.wrap_read(f).lines().enumerate() { + let page = serde_json::from_str::(&line?).unwrap(); + let normalized = 
normalizer.normalize(&page.title); + + let (sift_i, _) = title_lookup[&normalized]; + if i as u32 != sift_i { + // Articles may occur multiple times, and this is not the instance + // of the article we should keep. + bar.println(format!( + " Skipping {:?} ({i}) in favor of {sift_i}", + page.title + )); + continue; + } + + data.graph.add_node(); + data.pages.push(Page { + id: page.id, + title: page.title, + length: page.length, + redirect: page.redirect.is_some(), }); - if let Some(to) = json_page.redirect { - let to = titles.insert(util::normalize_link(&to)); - result.push_link(to, LinkInfo::default()); - } else { - for (to, start, len, flags) in json_page.links { - let to = titles.insert(util::normalize_link(&to)); - result.push_link(to, LinkInfo { start, len, flags }); - } + let mut page_links = page.links; + if let Some(target) = page.redirect { + page_links.clear(); + let len = target.len() as u32; + page_links.push((target, 0, len, 0)); } - if (i + 1) % 100_000 == 0 { - eprintln!("{} pages imported", i + 1) - } - } - - eprintln!("Pages: {}", result.pages.len()); - eprintln!("Links: {}", result.links.len()); - eprintln!("Titles: {}", titles.titles.len()); - eprintln!("Title map entries: {}", titles.map.len()); - - Ok((result, titles)) -} - -/// Create map from normalized title to index in pages. 
-fn initialize_pages_map(pages: &[Page]) -> FxHashMap { - let mut result = FxHashMap::default(); - for (i, p) in pages.iter().enumerate() { - match result.entry(util::normalize_link(&p.data.title)) { - Entry::Occupied(entry) => { - eprintln!( - "{:?} already exists at index {} as {:?}", - p.data.title, - entry.get(), - util::normalize_link(&p.data.title) - ); - } - Entry::Vacant(entry) => { - entry.insert(i as u32); + for (target, start, len, flags) in page_links { + if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) { + data.graph.add_edge(NodeIdx(*brood_i)); + data.links.push(Link { start, len, flags }); } } } - result + + Ok(data) } -fn second_stage( - first_stage: &AdjacencyList, - titles: &Titles, -) -> AdjacencyList { - let pages_map = initialize_pages_map(&first_stage.pages); - let mut result = AdjacencyList::default(); +/// Convert sift data to brood data. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + /// The sift data file to ingest. + data: PathBuf, +} - for (page_idx, page) in first_stage.pages() { - result.push_page(page.data.clone()); +impl Cmd { + pub fn run(&self, brood_data: &Path) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); - for (_, link) in first_stage.links(page_idx) { - let title = util::normalize_link(titles.get(link.to)); - if let Some(to) = pages_map.get(&title) { - // The link points to an existing article, we should keep it - result.push_link(*to, link.data); - } - } + println!(">> First pass"); + let mut sift_data = BufReader::new(File::open(&self.data)?); - if (page_idx + 1) % 100_000 == 0 { - eprintln!("{} pages imported", page_idx + 1) - } + println!("> Reading titles"); + let titles = read_titles(&mut sift_data)?; + + println!("> Computing title index lookup table"); + let title_lookup = compute_title_lookup(&normalizer, &titles); + drop(titles); // Don't hoard memory + + println!(">> Second pass"); + + println!("> Reading page data"); + let data = read_page_data(&normalizer, 
&title_lookup, &mut sift_data)?; + assert_eq!(data.pages.len(), title_lookup.len()); + drop(title_lookup); // Don't hoard memory + drop(sift_data); // No longer needed + + println!("> Checking consistency"); + data.check_consistency(); + + println!(">> Export"); + println!( + "Pages: {:>13}", + data.pages.len().separate_with_underscores() + ); + println!( + "Links: {:>13}", + data.links.len().separate_with_underscores() + ); + data.write_to_file(brood_data)?; + + Ok(()) } - - eprintln!("Pages: {}", result.pages.len()); - eprintln!("Links: {}", result.links.len()); - eprintln!("Page map entries: {}", pages_map.len()); - - result -} - -pub fn ingest(datafile: &Path) -> io::Result<()> { - eprintln!(">> First stage"); - let (first_stage, titles) = first_stage()?; - - eprintln!(">> Second stage"); - let data = second_stage(&first_stage, &titles); - - eprintln!(">> Consistency check"); - data.check_consistency(); - - eprintln!(">> Export"); - let mut datafile = BufWriter::new(File::create(datafile)?); - store::write_adjacency_list(&data, &mut datafile)?; - - Ok(()) } diff --git a/brood/src/commands/list_pages.rs b/brood/src/commands/list_pages.rs deleted file mode 100644 index 5f659ea..0000000 --- a/brood/src/commands/list_pages.rs +++ /dev/null @@ -1,23 +0,0 @@ -use std::fs::File; -use std::io::{self, BufReader}; -use std::path::Path; - -use crate::data::store; - -pub fn run(datafile: &Path) -> io::Result<()> { - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - - for (page_idx, page) in data.pages() { - if page.data.redirect { - for link_idx in data.link_range(page_idx) { - let target_page = data.page(data.link(link_idx).to); - println!("{:?} -> {:?}", page.data.title, target_page.data.title); - } - } else { - println!("{:?}", page.data.title); - } - } - - Ok(()) -} diff --git a/brood/src/commands/longest_path.rs b/brood/src/commands/longest_path.rs new file mode 100644 index 0000000..1ac8e40 --- 
/dev/null +++ b/brood/src/commands/longest_path.rs @@ -0,0 +1,70 @@ +use std::io; + +use crate::{ + algo::Dijkstra, + data::Data, + graph::NodeIdx, + util::{self, TitleNormalizer}, +}; + +/// Find the article with the longest shortest path away from the starting +/// article. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + start: String, + #[arg(long, short, default_value_t = 1)] + top: usize, +} + +fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec)>) { + let start = &data.pages[start.usize()].title; + let goal = &data.pages[goal.usize()].title; + + let Some((cost, path)) = path else { + println!("No path found from {start} to {goal}"); + return; + }; + + println!("Path found (cost {cost}, length {}):", path.len()); + + for page in path { + println!("{}", util::fmt_page(&data.pages[page.usize()])); + } +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + println!(">> Resolve article"); + let start = util::resolve_title(&normalizer, &data, &self.start); + println!("Start: {}", data.pages[start.usize()].title); + + println!(">> Search paths"); + println!("> Preparing dijkstra"); + let mut dijkstra = Dijkstra::new(&data.graph); + println!("> Running dijkstra"); + dijkstra.run( + start, + |_| false, + |source, _edge, _target| !data.pages[source.usize()].redirect as u32, + ); + + println!(">> Find longest paths"); + let mut costs = data + .graph + .nodes() + .map(|n| (dijkstra.cost(n), n)) + .filter(|(c, _)| *c < u32::MAX) // Only reachable nodes please + .collect::>(); + costs.sort_unstable(); + + for (cost, goal) in costs.iter().rev().take(self.top) { + let path = dijkstra.path(*goal); + println!(); + print_path(&data, start, *goal, Some((*cost, path))); + } + + Ok(()) + } +} diff --git a/brood/src/commands/longest_shortest_path.rs b/brood/src/commands/longest_shortest_path.rs deleted file mode 100644 index e15eb17..0000000 --- a/brood/src/commands/longest_shortest_path.rs 
+++ /dev/null @@ -1,173 +0,0 @@ -use std::collections::BinaryHeap; -use std::fs::File; -use std::io::{self, BufReader}; -use std::path::Path; - -use crate::data::adjacency_list::AdjacencyList; -use crate::data::info::{LinkInfo, PageInfo}; -use crate::data::store; -use crate::util; - -struct DijkstraPageInfo { - cost: u32, - /// Index of the previous page. - prev: u32, - redirect: bool, -} - -impl DijkstraPageInfo { - fn from_page_info(info: PageInfo) -> Self { - Self { - cost: u32::MAX, - prev: u32::MAX, - redirect: info.redirect, - } - } -} - -struct DijkstraLinkInfo { - cost: u32, -} - -impl DijkstraLinkInfo { - fn from_link_info(info: LinkInfo) -> Self { - Self { - cost: 1, - // cost: 1000 + info.start, - // cost: 10000 + info.start, - // cost: 1000 + info.start / 10, - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq)] -struct Entry { - cost: u32, - page_idx: u32, -} - -impl Entry { - pub fn new(cost: u32, page_idx: u32) -> Self { - Self { cost, page_idx } - } -} - -// Manual implementation so the queue is a min-heap instead of a max-heap. -impl Ord for Entry { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - other - .cost - .cmp(&self.cost) - .then_with(|| self.page_idx.cmp(&other.page_idx)) - } -} - -impl PartialOrd for Entry { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -/// Closely matches the dijkstra example in [std::collections::binary_heap]. 
-fn full_dijkstra( - data: AdjacencyList, - from: u32, -) -> AdjacencyList { - println!("> Prepare state"); - let mut data = data - .change_page_data(DijkstraPageInfo::from_page_info) - .change_link_data(DijkstraLinkInfo::from_link_info); - let mut queue = BinaryHeap::new(); - data.page_mut(from).data.cost = 0; - queue.push(Entry::new(0, from)); - - println!("> Run dijkstra"); - while let Some(Entry { cost, page_idx }) = queue.pop() { - let page = data.page(page_idx); - if cost > page.data.cost { - // This queue entry is outdated - continue; - } - - let redirect = page.data.redirect; - for link_idx in data.link_range(page_idx) { - let link = data.link(link_idx); - - let next = Entry { - cost: cost + if redirect { 0 } else { link.data.cost }, - page_idx: link.to, - }; - - let target_page = data.page_mut(link.to); - if next.cost < target_page.data.cost { - target_page.data.cost = next.cost; - target_page.data.prev = page_idx; - queue.push(next); - } - } - } - - data -} - -fn find_longest_shortest_path( - data: AdjacencyList, - from: u32, -) -> Option> { - let to = data - .pages - .iter() - .enumerate() - .filter(|(_, p)| p.data.cost != u32::MAX) - .max_by_key(|(_, p)| p.data.cost)? 
- .0 as u32; - - let mut steps = vec![]; - let mut at = to; - loop { - steps.push(at); - at = data.page(at).data.prev; - if at == u32::MAX { - break; - }; - } - steps.reverse(); - if steps.first() == Some(&from) { - Some(steps) - } else { - None - } -} - -pub fn run(datafile: &Path, from: &str) -> io::Result<()> { - println!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - let pages = data.pages.clone(); - - println!(">> Locate from and to"); - let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from)); - println!("From: {:?}", data.page(from_idx).data.title); - - println!(">> Find all shortest paths"); - let data = full_dijkstra(data, from_idx); - - println!(">> Find longest shortest path"); - let path = find_longest_shortest_path(data, from_idx); - - if let Some(path) = path { - println!("Path found:"); - for page_idx in path { - let page = &pages[page_idx as usize]; - if page.data.redirect { - println!(" v {:?}", page.data.title); - } else { - println!(" - {:?}", page.data.title); - } - } - } else { - println!("No path found"); - } - - Ok(()) -} diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index 82079d2..4f58bb6 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -1,159 +1,87 @@ -use std::collections::BinaryHeap; -use std::fs::File; -use std::io::{self, BufReader}; -use std::path::Path; +use std::io; -use crate::data::adjacency_list::AdjacencyList; -use crate::data::info::{LinkInfo, PageInfo}; -use crate::data::store; -use crate::util; +use crate::{ + algo::Dijkstra, + data::Data, + graph::NodeIdx, + util::{self, TitleNormalizer}, +}; -struct DijkstraPageInfo { - cost: u32, - prev: u32, - redirect: bool, +/// Find the shortest path between two articles. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + start: String, + goal: String, + + // Search for a path in both directions. 
+ #[arg(long, short)] + bidi: bool, } -impl DijkstraPageInfo { - fn from_page_info(info: PageInfo) -> Self { - Self { - cost: u32::MAX, - prev: u32::MAX, - redirect: info.redirect, - } +fn search_path(data: &Data, start: NodeIdx, goal: NodeIdx) -> Option<(u32, Vec)> { + println!("> Preparing dijkstra"); + let mut dijkstra = Dijkstra::new(&data.graph); + println!("> Running dijkstra"); + dijkstra.run( + start, + |node| node == goal, + |source, _edge, _target| !data.pages[source.usize()].redirect as u32, + ); + + if dijkstra.cost(goal) == u32::MAX { + return None; + } + + println!("> Collecting path"); + let cost = dijkstra.cost(goal); + let path = dijkstra.path(goal); + Some((cost, path)) +} + +fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec)>) { + let start = &data.pages[start.usize()].title; + let goal = &data.pages[goal.usize()].title; + + let Some((cost, path)) = path else { + println!("No path found from {start} to {goal}"); + return; + }; + + println!("Path found (cost {cost}, length {}):", path.len()); + + for page in path { + println!("{}", util::fmt_page(&data.pages[page.usize()])); } } -struct DijkstraLinkInfo { - cost: u32, -} +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); -impl DijkstraLinkInfo { - fn from_link_info(info: LinkInfo) -> Self { - Self { - cost: 1, - // cost: 1000 + info.start, - // cost: 10000 + info.start, - // cost: 1000 + info.start / 10, - } - } -} + println!(">> Resolve articles"); + let start = util::resolve_title(&normalizer, &data, &self.start); + let goal = util::resolve_title(&normalizer, &data, &self.goal); + println!("Start: {}", data.pages[start.usize()].title); + println!("Goal: {}", data.pages[goal.usize()].title); -#[derive(Clone, Copy, PartialEq, Eq)] -struct Entry { - cost: u32, - page_idx: u32, -} + if self.bidi { + println!(">> Find path forward"); + let forward = search_path(&data, start, goal); + println!(">> Find path 
backward"); + let backward = search_path(&data, goal, start); -impl Entry { - pub fn new(cost: u32, page_idx: u32) -> Self { - Self { cost, page_idx } - } -} + println!(); + print_path(&data, start, goal, forward); + println!(); + print_path(&data, goal, start, backward); + } else { + println!(">> Find path"); + let path = search_path(&data, start, goal); -// Manual implementation so the queue is a min-heap instead of a max-heap. -impl Ord for Entry { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - other - .cost - .cmp(&self.cost) - .then_with(|| self.page_idx.cmp(&other.page_idx)) - } -} - -impl PartialOrd for Entry { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -/// Closely matches the dijkstra example in [std::collections::binary_heap]. -fn dijkstra(data: AdjacencyList, from: u32, to: u32) -> Option> { - println!("> Prepare state"); - let mut data = data - .change_page_data(DijkstraPageInfo::from_page_info) - .change_link_data(DijkstraLinkInfo::from_link_info); - let mut queue = BinaryHeap::new(); - data.page_mut(from).data.cost = 0; - queue.push(Entry::new(0, from)); - - println!("> Run dijkstra"); - while let Some(Entry { cost, page_idx }) = queue.pop() { - if page_idx == to { - // We've found the shortest path to our target - break; + println!(); + print_path(&data, start, goal, path); } - let page = data.page(page_idx); - if cost > page.data.cost { - // This queue entry is outdated - continue; - } - - let redirect = page.data.redirect; - for link_idx in data.link_range(page_idx) { - let link = data.link(link_idx); - - let next = Entry { - cost: cost + if redirect { 0 } else { link.data.cost }, - page_idx: link.to, - }; - - let target_page = data.page_mut(link.to); - if next.cost < target_page.data.cost { - target_page.data.cost = next.cost; - target_page.data.prev = page_idx; - queue.push(next); - } - } - } - - println!("> Collect results"); - let mut steps = vec![]; - let mut at = to; - loop { - steps.push(at); 
- at = data.page(at).data.prev; - if at == u32::MAX { - break; - }; - } - steps.reverse(); - if steps.first() == Some(&from) { - Some(steps) - } else { - None + Ok(()) } } - -pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { - println!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - let pages = data.pages.clone(); - - println!(">> Locate from and to"); - let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from)); - let to_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, to)); - println!("From: {:?}", data.page(from_idx).data.title); - println!("To: {:?}", data.page(to_idx).data.title); - - println!(">> Find path"); - let path = dijkstra(data, from_idx, to_idx); - - if let Some(path) = path { - println!("Path found:"); - for page_idx in path { - let page = &pages[page_idx as usize]; - if page.data.redirect { - println!(" v {:?}", page.data.title); - } else { - println!(" - {:?}", page.data.title); - } - } - } else { - println!("No path found"); - } - - Ok(()) -} diff --git a/brood/src/commands/pg.rs b/brood/src/commands/pg.rs new file mode 100644 index 0000000..a106a3a --- /dev/null +++ b/brood/src/commands/pg.rs @@ -0,0 +1,273 @@ +use std::{ + collections::{BTreeSet, HashMap, HashSet}, + io::{self, BufWriter}, +}; + +use crate::{ + data::Data, + graph::NodeIdx, + util::{self, TitleNormalizer}, +}; + +struct PageMap(Vec); + +impl PageMap { + fn new(len: usize) -> Self { + Self(vec![NodeIdx::NONE; len]) + } + + fn get(&self, node: NodeIdx) -> NodeIdx { + self.0[node.usize()] + } + + fn set(&mut self, node: NodeIdx, to: NodeIdx) { + self.0[node.usize()] = to; + } +} + +fn first_viable_link(data: &Data, node: NodeIdx) -> Option { + for edge in data.graph.edge_slice(node) { + let link = &data.links[edge.usize()]; + if !link.in_parens() && !link.in_structure() { + return Some(*edge); + } + } + None +} + +fn 
find_forward_edges(data: &Data) -> PageMap { + let mut result = PageMap::new(data.pages.len()); + for node in data.graph.nodes() { + if let Some(first_link) = first_viable_link(data, node) { + result.set(node, first_link); + } + } + result +} + +fn find_clusters(data: &Data, forward: &PageMap) -> PageMap { + let mut cluster = PageMap::new(data.pages.len()); + for node in data.graph.nodes() { + let mut current = node; + let mut visited = HashSet::new(); + let canonical = loop { + // We've already determined the canonical element for this page. + if cluster.get(current) != NodeIdx::NONE { + break cluster.get(current); + } + + // We've hit a loop + if visited.contains(&current) { + let mut loop_members = BTreeSet::new(); + while !loop_members.contains(&current) { + loop_members.insert(current); + current = forward.get(current); + } + break loop_members.pop_first().unwrap(); + } + + visited.insert(current); + + let next = forward.get(current); + if next == NodeIdx::NONE { + // We've hit a dead-end + break current; + } + + current = next; + }; + + for i in visited { + cluster.set(i, canonical); + } + } + + cluster +} + +enum Cluster { + DeadEnd(NodeIdx), + Loop(Vec), +} + +fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap { + let mut result = HashMap::new(); + for canonical in cluster.0.iter().copied().collect::>() { + if forward.get(canonical) == NodeIdx::NONE { + result.insert(canonical, Cluster::DeadEnd(canonical)); + continue; + } + + let mut members = vec![]; + let mut current = canonical; + loop { + members.push(current); + current = forward.get(current); + if current == canonical { + break; + } + } + result.insert(canonical, Cluster::Loop(members)); + } + + result +} + +fn print_forward_edges_as_json(data: &Data, forward: &PageMap) -> io::Result<()> { + let map = forward + .0 + .iter() + .enumerate() + .map(|(node, first_link)| { + let page_title = &data.pages[node].title; + let first_link_title = if *first_link == NodeIdx::NONE { + None + } else { +
Some(&data.pages[first_link.usize()].title) + }; + (page_title, first_link_title) + }) + .collect::>(); + + let writer = BufWriter::new(io::stdout()); + serde_json::to_writer_pretty(writer, &map)?; + Ok(()) +} + +fn print_trace(normalizer: &TitleNormalizer, data: &Data, forward: &PageMap, start: &str) { + let start_idx = util::resolve_title(normalizer, data, start); + + let mut current = start_idx; + let mut visited = HashSet::new(); + loop { + let page = &data.pages[current.usize()]; + let title = &page.title; + if page.redirect { + println!(" v {title}"); + } else { + println!(" - {title}"); + } + + visited.insert(current); + + let next = forward.get(current); + + if next == NodeIdx::NONE { + println!("> dead-end reached"); + return; + } + + if visited.contains(&next) { + let page = &data.pages[next.usize()]; + let title = &page.title; + println!("> loop detected ({title})"); + return; + } + + current = next; + } +} + +fn print_canonical_pages_as_json(data: &Data, cluster: &PageMap) -> io::Result<()> { + let map = cluster + .0 + .iter() + .enumerate() + .map(|(page, canonical)| { + ( + &data.pages[page].title, + &data.pages[canonical.usize()].title, + ) + }) + .collect::>(); + + let writer = BufWriter::new(io::stdout()); + serde_json::to_writer_pretty(writer, &map)?; + Ok(()) +} + +#[derive(Debug, PartialEq, Eq, clap::Parser)] +enum Command { + First, + Trace { start: String }, + Canonical, + Cluster, +} + +/// Show interesting stats. 
+#[derive(Debug, clap::Parser)] +pub struct Cmd { + #[command(subcommand)] + command: Command, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + eprintln!(">> Forward"); + let forward = find_forward_edges(&data); + + match self.command { + Command::First => { + eprintln!(">> First links"); + print_forward_edges_as_json(&data, &forward)?; + return Ok(()); + } + Command::Trace { start } => { + eprintln!(">> Tracing"); + print_trace(&normalizer, &data, &forward, &start); + return Ok(()); + } + _ => {} + } + + // Determine cluster for each page, represented via canonical page. The + // canonical page of a cluster is either a dead-end or the loop member with + // the smallest index. + eprintln!(">> Find clusters"); + let cluster = find_clusters(&data, &forward); + + if self.command == Command::Canonical { + print_canonical_pages_as_json(&data, &cluster)?; + return Ok(()); + } + + // Measure cluster size + eprintln!(">> Measure clusters"); + let mut cluster_size = HashMap::::new(); + for (i, canonical) in cluster.0.iter().enumerate() { + assert!(*canonical != NodeIdx::NONE, "{}", data.pages[i].title); + *cluster_size.entry(*canonical).or_default() += 1; + } + let mut cluster_by_size = cluster_size.into_iter().collect::>(); + cluster_by_size.sort_by_key(|(c, s)| (*s, *c)); + cluster_by_size.reverse(); + + // Print clusters + assert!(self.command == Command::Cluster); + let resolved = resolve_clusters(&forward, &cluster); + for (canonical, size) in cluster_by_size { + match resolved.get(&canonical).unwrap() { + Cluster::DeadEnd(page) => { + let title = &data.pages[page.usize()].title; + println!("Cluster (dead-end, {size}): {title}"); + } + Cluster::Loop(pages) => { + println!("Cluster ({}-loop, {size}):", pages.len()); + for page in pages { + let page = &data.pages[page.usize()]; + let title = &page.title; + if page.redirect { + println!(" v {title}"); + } else { + println!(" - {title}"); + } + } + } + } + } 
+ + Ok(()) + } +} diff --git a/brood/src/commands/philosophy_game.rs b/brood/src/commands/philosophy_game.rs deleted file mode 100644 index 178df1d..0000000 --- a/brood/src/commands/philosophy_game.rs +++ /dev/null @@ -1,267 +0,0 @@ -use std::{ - collections::{BTreeSet, HashMap, HashSet}, - fs::File, - io::{self, BufReader, BufWriter}, - path::Path, -}; - -use crate::{ - data::{ - adjacency_list::AdjacencyList, - info::{LinkInfo, PageInfo}, - store, - }, - util, PhilosophyGameCmd, -}; - -struct PageMap(Vec); - -impl PageMap { - fn new(len: usize) -> Self { - Self(vec![u32::MAX; len]) - } - - fn get(&self, page_idx: u32) -> u32 { - self.0[page_idx as usize] - } - - fn set(&mut self, page_idx: u32, to: u32) { - self.0[page_idx as usize] = to; - } -} - -fn first_viable_link(data: &AdjacencyList, page_idx: u32) -> Option { - for link_idx in data.link_range(page_idx) { - let link = data.link(link_idx); - if !link.data.in_parens() && !link.data.in_structure() { - return Some(link.to); - } - } - None -} - -fn find_forward_edges(data: &AdjacencyList) -> PageMap { - let mut result = PageMap::new(data.pages.len()); - for (page_idx, _) in data.pages() { - if let Some(first_link) = first_viable_link(data, page_idx) { - result.set(page_idx, first_link); - } - } - result -} - -fn find_clusters(data: &AdjacencyList, forward: &PageMap) -> PageMap { - let mut cluster = PageMap::new(data.pages.len()); - for (page_idx, _) in data.pages() { - let mut current = page_idx; - let mut visited = HashSet::new(); - let canonical = loop { - // We've already determined the canonical element for this page. 
- if cluster.get(current) != u32::MAX { - break cluster.get(current); - } - - // We've hit a loop - if visited.contains(&current) { - let mut loop_members = BTreeSet::new(); - while !loop_members.contains(&current) { - loop_members.insert(current); - current = forward.get(current); - } - break loop_members.pop_first().unwrap(); - } - - visited.insert(current); - - let next = forward.get(current); - if next == u32::MAX { - // We've hit a dead-end - break current; - } - - current = next; - }; - - for i in visited { - cluster.set(i, canonical); - } - } - - cluster -} - -enum Cluster { - DeadEnd(u32), - Loop(Vec), -} - -fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap { - let mut result = HashMap::new(); - for canonical in cluster.0.iter().copied().collect::>() { - if forward.get(canonical) == u32::MAX { - result.insert(canonical, Cluster::DeadEnd(canonical)); - continue; - } - - let mut members = vec![]; - let mut current = canonical; - loop { - members.push(current); - current = forward.get(current); - if current == canonical { - break; - } - } - result.insert(canonical, Cluster::Loop(members)); - } - - result -} - -fn print_forward_edges_as_json( - data: &AdjacencyList, - forward: &PageMap, -) -> io::Result<()> { - let map = forward - .0 - .iter() - .enumerate() - .map(|(page, first_link)| { - let page_title = &data.page(page as u32).data.title; - let first_link_title = if *first_link == u32::MAX { - None - } else { - Some(&data.page(*first_link).data.title) - }; - (page_title, first_link_title) - }) - .collect::>(); - - let writer = BufWriter::new(io::stdout()); - serde_json::to_writer_pretty(writer, &map)?; - Ok(()) -} - -fn print_trace(data: &AdjacencyList, forward: &PageMap, start: &str) { - let start_idx = util::resolve_redirects(data, util::find_index_of_title(&data.pages, start)); - - let mut current = start_idx; - let mut visited = HashSet::new(); - loop { - let page = data.page(current); - let title = &page.data.title; - if page.data.redirect { -
println!(" v {title}"); - } else { - println!(" - {title}"); - } - - visited.insert(current); - - let next = forward.get(current); - - if next == u32::MAX { - println!("dead-end reached"); - return; - } - - if visited.contains(&next) { - println!("loop detected"); - return; - } - - current = next; - } -} - -fn print_canonical_pages_as_json( - data: &AdjacencyList, - cluster: &PageMap, -) -> io::Result<()> { - let map = cluster - .0 - .iter() - .enumerate() - .map(|(page, canonical)| { - ( - &data.page(page as u32).data.title, - &data.page(*canonical).data.title, - ) - }) - .collect::>(); - - let writer = BufWriter::new(io::stdout()); - serde_json::to_writer_pretty(writer, &map)?; - Ok(()) -} - -pub fn run(datafile: &Path, subcmd: PhilosophyGameCmd) -> io::Result<()> { - eprintln!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - - eprintln!(">> Forward"); - let forward = find_forward_edges(&data); - - match subcmd { - PhilosophyGameCmd::First => { - eprintln!(">> First links"); - print_forward_edges_as_json(&data, &forward)?; - return Ok(()); - } - PhilosophyGameCmd::Trace { start } => { - eprintln!(">> Tracing"); - print_trace(&data, &forward, &start); - return Ok(()); - } - _ => {} - } - - // Determine cluster for each page, represented via canonical page. The - // canonical page of a cluster is either a dead-end or the loop member with - // the smallest index. 
- eprintln!(">> Find clusters"); - let cluster = find_clusters(&data, &forward); - - if subcmd == PhilosophyGameCmd::Canonical { - print_canonical_pages_as_json(&data, &cluster)?; - return Ok(()); - } - - // Measure cluster size - eprintln!(">> Measure clusters"); - let mut cluster_size = HashMap::::new(); - for (i, canonical) in cluster.0.iter().enumerate() { - assert!(*canonical != u32::MAX, "{}", data.page(i as u32).data.title); - *cluster_size.entry(*canonical).or_default() += 1; - } - let mut cluster_by_size = cluster_size.into_iter().collect::>(); - cluster_by_size.sort_by_key(|(c, s)| (*s, *c)); - cluster_by_size.reverse(); - - // Print clusters - assert!(subcmd == PhilosophyGameCmd::Cluster); - let resolved = resolve_clusters(&forward, &cluster); - for (canonical, size) in cluster_by_size { - match resolved.get(&canonical).unwrap() { - Cluster::DeadEnd(page) => { - let title = &data.page(*page).data.title; - println!("Cluster (dead-end, {size}): {title}"); - } - Cluster::Loop(pages) => { - println!("Cluster ({}-loop, {size}):", pages.len()); - for page in pages { - let page = data.page(*page); - let title = &page.data.title; - if page.data.redirect { - println!(" v {title}"); - } else { - println!(" - {title}"); - } - } - } - } - } - - Ok(()) -} diff --git a/brood/src/commands/reexport.rs b/brood/src/commands/reexport.rs deleted file mode 100644 index 1125fb0..0000000 --- a/brood/src/commands/reexport.rs +++ /dev/null @@ -1,48 +0,0 @@ -use std::fs::File; -use std::io::{self, BufReader, BufWriter}; -use std::path::Path; - -use crate::data::adjacency_list::AdjacencyList; -use crate::data::store; - -pub fn reexport( - from: &Path, - to: &Path, - in_parens: Option, - in_structure: Option, -) -> io::Result<()> { - eprintln!(">> Import"); - let mut from = BufReader::new(File::open(from)?); - let mut data = store::read_adjacency_list(&mut from)?; - - eprintln!(">> Consistency check"); - data.check_consistency(); - - if in_parens.is_some() || in_structure.is_some() 
{ - eprintln!(">> Filtering"); - - let mut data2 = AdjacencyList::default(); - for (page_idx, page) in data.pages() { - data2.push_page(page.data.clone()); - for (_, link) in data.links(page_idx) { - if in_parens.is_some_and(|v| v != link.data.in_parens()) { - continue; - } - - if in_structure.is_some_and(|v| v != link.data.in_structure()) { - continue; - } - - data2.push_link(link.to, link.data); - } - } - - data = data2; - } - - eprintln!(">> Export"); - let mut to = BufWriter::new(File::create(to)?); - store::write_adjacency_list(&data, &mut to)?; - - Ok(()) -} diff --git a/brood/src/commands/show.rs b/brood/src/commands/show.rs new file mode 100644 index 0000000..0c67388 --- /dev/null +++ b/brood/src/commands/show.rs @@ -0,0 +1,151 @@ +use std::{collections::HashSet, io}; + +use thousands::Separable; + +use crate::{ + data::Data, + util::{self, TitleNormalizer}, +}; + +/// Show info about a specific article. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + title: String, + + /// Print links in more detail. 
+ #[arg(long, short)] + links: bool, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + println!(">> Locate article"); + let mut node = util::locate_title(&normalizer, &data, &self.title); + + loop { + let page = &data.pages[node.usize()]; + + const W_LABEL: usize = 12; + const W_NUM: usize = 11; + + println!(); + + println!("{:>W_LABEL$}: {}", "Title", page.title); + + println!( + "{:>W_LABEL$}: {}", + "Title (norm)", + normalizer.normalize(&page.title) + ); + + println!("{:>W_LABEL$}: {}", "Redirect", page.redirect); + + println!("{:>W_LABEL$}: {:>W_NUM$}", "ID", page.id); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Length", + page.length.separate_with_underscores() + ); + + let outlinks = data.graph.edge_slice(node).to_vec(); + let inlinks = data + .graph + .edges() + .filter(|(_, target)| *target == node) + .map(|(source, _)| source) + .collect::>(); + + let outlinks_set = outlinks.iter().copied().collect::>(); + let inlinks_set = inlinks.iter().copied().collect::>(); + let twins_set = outlinks_set + .intersection(&inlinks_set) + .copied() + .collect::>(); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Links (out)", + outlinks.len().separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "unique", + outlinks_set.len().separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Links (in)", + inlinks.len().separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "unique", + inlinks_set.len().separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Twins", + twins_set.len().separate_with_underscores() + ); + + if self.links { + let mut twin_pages = twins_set + .iter() + .map(|n| &data.pages[n.usize()]) + .collect::>(); + + let mut outlink_only_pages = outlinks_set + .difference(&twins_set) + .map(|n| &data.pages[n.usize()]) + .collect::>(); + + let mut inlink_only_pages = inlinks_set + .difference(&twins_set) + 
.map(|n| &data.pages[n.usize()]) + .collect::>(); + + twin_pages.sort_by_key(|p| &p.title); + outlink_only_pages.sort_by_key(|p| &p.title); + inlink_only_pages.sort_by_key(|p| &p.title); + + println!(); + println!("Twins ({}):", twin_pages.len().separate_with_underscores()); + for page in twin_pages { + println!("{}", util::fmt_page(page)); + } + + println!(); + println!( + "Only outlinks ({}):", + outlink_only_pages.len().separate_with_underscores() + ); + for page in outlink_only_pages { + println!("{}", util::fmt_page(page)); + } + + println!(); + println!( + "Only inlinks ({}):", + inlink_only_pages.len().separate_with_underscores() + ); + for page in inlink_only_pages { + println!("{}", util::fmt_page(page)); + } + } + + node = match data.redirect_target(node) { + Some(target) => target, + None => break, + }; + } + + Ok(()) + } +} diff --git a/brood/src/commands/stats.rs b/brood/src/commands/stats.rs new file mode 100644 index 0000000..760cec6 --- /dev/null +++ b/brood/src/commands/stats.rs @@ -0,0 +1,98 @@ +mod degrees; +mod redirects; + +use std::io; + +use thousands::Separable; + +use crate::data::Data; + +#[derive(Debug, clap::Parser)] +enum Command { + Degrees(degrees::Cmd), + Redirects(redirects::Cmd), +} + +/// Show interesting stats. 
+#[derive(Debug, clap::Parser)] +pub struct Cmd { + #[command(subcommand)] + command: Option, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + if let Some(cmd) = self.command { + return match cmd { + Command::Degrees(cmd) => cmd.run(data), + Command::Redirects(cmd) => cmd.run(data), + }; + } + + println!(); + + const W_LABEL: usize = 14; + const W_NUM: usize = 11; + + let n_pages = data.pages.len(); + let n_redirects = data.pages.iter().filter(|p| p.redirect).count(); + let n_articles = n_pages - n_redirects; + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Pages", + n_pages.separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Articles", + n_articles.separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Redirects", + n_redirects.separate_with_underscores() + ); + + println!(); + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "Links", + data.links.len().separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "in parens", + data.links + .iter() + .filter(|l| l.in_parens()) + .count() + .separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "in structures", + data.links + .iter() + .filter(|l| l.in_structure()) + .count() + .separate_with_underscores() + ); + + println!( + "{:>W_LABEL$}: {:>W_NUM$}", + "pg eligible", + data.links + .iter() + .filter(|l| !l.in_parens() && !l.in_structure()) + .count() + .separate_with_underscores() + ); + + Ok(()) + } +} diff --git a/brood/src/commands/stats/degrees.rs b/brood/src/commands/stats/degrees.rs new file mode 100644 index 0000000..5bd05be --- /dev/null +++ b/brood/src/commands/stats/degrees.rs @@ -0,0 +1,92 @@ +use std::{cmp::Reverse, io}; + +use thousands::Separable; + +use crate::{ + algo, + data::{Data, Page}, + util, +}; + +/// Show stats on article in- and out-degrees. 
+#[derive(Debug, clap::Parser)] +pub struct Cmd { + #[arg(long, short, default_value_t = 5)] + top: usize, +} + +impl Cmd { + pub fn run(self, mut data: Data) -> io::Result<()> { + println!(">> Outdegree"); + println!("> Counting links"); + let mut outdegree = vec![usize::MAX; data.pages.len()]; + for node in data.graph.nodes() { + outdegree[node.usize()] = data.graph.edge_range(node).len(); + } + + println!(">> Indegree"); + println!("> Inverting edges"); + algo::invert(&mut data); + let mut indegree = vec![usize::MAX; data.pages.len()]; + println!("> Counting links"); + for node in data.graph.nodes() { + indegree[node.usize()] = data.graph.edge_range(node).len(); + } + + let mut by_degrees = data + .pages + .iter() + .zip(outdegree) + .zip(indegree) + .map(|((p, od), id)| (p, od, id)) + .collect::>(); + + println!(); + println!("Most outlinks"); + println!("¯¯¯¯¯¯¯¯¯¯¯¯¯"); + + by_degrees.sort_by_key(|(_, od, _)| Reverse(*od)); + self.print_links(&by_degrees); + + println!(); + println!("Most inlinks"); + println!("¯¯¯¯¯¯¯¯¯¯¯¯"); + + by_degrees.sort_by_key(|(_, _, id)| Reverse(*id)); + self.print_links(&by_degrees); + + by_degrees.retain(|(_, od, id)| *od > 0 && *id > 0); + + println!(); + println!("Most outlinks per non-zero inlink"); + println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯"); + + by_degrees.sort_by(|(_, od1, id1), (_, od2, id2)| { + let r1 = *od1 as f32 / *id1 as f32; + let r2 = *od2 as f32 / *id2 as f32; + r2.total_cmp(&r1) // Reverse order so max values are at beginning + }); + self.print_links(&by_degrees); + + println!(); + println!("Most inlinks per non-zero outlink"); + println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯"); + + by_degrees.reverse(); + self.print_links(&by_degrees); + + Ok(()) + } + + fn print_links(&self, by_degrees: &Vec<(&Page, usize, usize)>) { + for (i, (page, od, id)) in by_degrees.iter().take(self.top).enumerate() { + println!( + "{:3}.
{} ({} out, {} in)", + i + 1, + util::fmt_page(page), + od.separate_with_underscores(), + id.separate_with_underscores() + ); + } + } +} diff --git a/brood/src/commands/stats/redirects.rs b/brood/src/commands/stats/redirects.rs new file mode 100644 index 0000000..6bf2204 --- /dev/null +++ b/brood/src/commands/stats/redirects.rs @@ -0,0 +1,107 @@ +use std::{cmp::Reverse, collections::HashSet, io}; + +use thousands::Separable; + +use crate::{data::Data, graph::NodeIdx, util}; + +fn find_redirects(data: &Data) -> Vec<(NodeIdx, NodeIdx, usize)> { + let mut redirects = Vec::<(NodeIdx, NodeIdx, usize)>::new(); + + for node in data.graph.nodes() { + if !data.pages[node.usize()].redirect { + continue; + } + + let mut seen = HashSet::new(); + + let mut curr = node; + seen.insert(node); + + while let Some(next) = data.redirect_target(curr) { + if seen.contains(&next) { + println!(" Redirect loop: {}", data.pages[node.usize()].title); + break; + } + + curr = next; + seen.insert(next); + } + + redirects.push((node, curr, seen.len() - 1)); + } + + redirects +} + +fn follow_redirect(data: &Data, start: NodeIdx) -> Vec { + let mut seen = HashSet::new(); + let mut nodes = Vec::new(); + + let mut curr = start; + seen.insert(curr); + nodes.push(curr); + + while let Some(next) = data.redirect_target(curr) { + if seen.contains(&next) { + break; + } + + curr = next; + seen.insert(curr); + nodes.push(curr); + } + + nodes +} + +/// Show redirect stats. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + /// Show more detailed info. 
+ #[arg(long, short)] + long: bool, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + println!(">> Resolve redirects"); + let redirects = find_redirects(&data); + + println!( + "There is a total of {} redirects.", + redirects.len().separate_with_underscores() + ); + + let mut long = redirects + .iter() + .filter(|(_, _, l)| *l > 1) + .collect::>(); + long.sort_by_key(|(_, _, l)| Reverse(l)); + + println!( + "{} redirects take more than one step to reach an article.", + long.len().separate_with_underscores() + ); + + println!( + "The longest redirect chain takes {} steps.", + long.iter().map(|(_, _, l)| l).max().copied().unwrap_or(0), + ); + + println!("Though these redirect chains are usually swiftly fixed by bots."); + + if self.long { + println!(); + println!("Redirect chains with length > 1:"); + + for (start, _, _) in long { + println!(); + for step in follow_redirect(&data, *start) { + println!("{}", util::fmt_page(&data.pages[step.usize()])); + } + } + } + + Ok(()) + } +} diff --git a/brood/src/data.rs b/brood/src/data.rs index 16aa0eb..c253094 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -1,3 +1,218 @@ -pub mod adjacency_list; -pub mod info; -pub mod store; +use std::{ + fs::File, + io::{self, BufReader, BufWriter, Read, Write}, + path::Path, +}; + +use crate::graph::{EdgeIdx, Graph, NodeIdx}; + +#[derive(Debug, Clone)] +pub struct Page { + pub id: u32, + pub title: String, + pub length: u32, + pub redirect: bool, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct Link { + pub start: u32, + pub len: u32, + pub flags: u8, +} + +impl Link { + pub fn in_parens(self) -> bool { + self.flags & 0b1 != 0 + } + + pub fn in_structure(self) -> bool { + self.flags & 0b10 != 0 + } +} + +fn write_u8(w: &mut impl Write, n: u8) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u8(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 1]; + r.read_exact(&mut buf)?; + Ok(u8::from_le_bytes(buf)) +} + +fn write_u16(w: 
&mut impl Write, n: u16) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u16(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 2]; + r.read_exact(&mut buf)?; + Ok(u16::from_le_bytes(buf)) +} + +fn write_u32(w: &mut impl Write, n: u32) -> io::Result<()> { + w.write_all(&n.to_le_bytes()) +} + +fn read_u32(r: &mut impl Read) -> io::Result { + let mut buf = [0_u8; 4]; + r.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) +} + +fn write_str(w: &mut impl Write, s: &str) -> io::Result<()> { + assert!(s.len() <= u16::MAX as usize); + write_u16(w, s.len() as u16)?; + w.write_all(s.as_bytes())?; + Ok(()) +} + +fn read_str(r: &mut impl Read) -> io::Result { + let len = read_u16(r)? as usize; + let mut buf = vec![0_u8; len]; + r.read_exact(&mut buf)?; + Ok(String::from_utf8(buf).unwrap()) +} + +fn write_page(w: &mut impl Write, page: &Page) -> io::Result<()> { + write_u32(w, page.id)?; + write_u32(w, page.length)?; + write_u8(w, if page.redirect { 1 } else { 0 })?; + write_str(w, &page.title)?; + Ok(()) +} + +pub fn read_page(r: &mut impl Read) -> io::Result { + Ok(Page { + id: read_u32(r)?, + length: read_u32(r)?, + redirect: read_u8(r)? 
!= 0, + title: read_str(r)?, + }) +} + +fn write_link(w: &mut impl Write, link: &Link) -> io::Result<()> { + write_u32(w, link.start)?; + write_u32(w, link.len)?; + write_u8(w, link.flags)?; + Ok(()) +} + +fn read_link(r: &mut impl Read) -> io::Result { + Ok(Link { + start: read_u32(r)?, + len: read_u32(r)?, + flags: read_u8(r)?, + }) +} + +#[derive(Default)] +pub struct Data { + pub pages: Vec, + pub links: Vec, + pub graph: Graph, +} + +impl Data { + pub fn new() -> Self { + Self::default() + } + + pub fn with_capacity(pages: usize, links: usize) -> Self { + Self { + pages: Vec::with_capacity(pages), + links: Vec::with_capacity(links), + graph: Graph::with_capacity(pages, links), + } + } + + fn write(&self, w: &mut impl Write) -> io::Result<()> { + assert!(self.pages.len() < u32::MAX as usize); + assert!(self.links.len() < u32::MAX as usize); + assert_eq!(self.pages.len(), self.graph.nodes.len()); + assert_eq!(self.links.len(), self.graph.edges.len()); + write_u32(w, self.pages.len() as u32)?; + write_u32(w, self.links.len() as u32)?; + + for page in &self.pages { + write_page(w, page)?; + } + + for link in &self.links { + write_link(w, link)?; + } + + for node in &self.graph.nodes { + write_u32(w, node.0)?; + } + + for edge in &self.graph.edges { + write_u32(w, edge.0)?; + } + + Ok(()) + } + + fn read(r: &mut impl Read) -> io::Result { + let n_pages = read_u32(r)?; + let n_links = read_u32(r)?; + + let mut result = Self::with_capacity(n_pages as usize, n_links as usize); + + for _ in 0..n_pages { + result.pages.push(read_page(r)?); + } + + for _ in 0..n_links { + result.links.push(read_link(r)?); + } + + for _ in 0..n_pages { + result.graph.nodes.push(EdgeIdx(read_u32(r)?)); + } + + for _ in 0..n_links { + result.graph.edges.push(NodeIdx(read_u32(r)?)); + } + + assert_eq!(result.pages.len(), result.graph.nodes.len()); + assert_eq!(result.links.len(), result.graph.edges.len()); + result.graph.check_consistency(); + Ok(result) + } + + pub fn write_to_file(&self, 
path: &Path) -> io::Result<()> { + let mut file = BufWriter::new(File::create(path)?); + self.write(&mut file) + } + + pub fn read_from_file(path: &Path) -> io::Result { + let mut file = BufReader::new(File::open(path)?); + Self::read(&mut file) + } + + pub fn check_consistency(&self) { + assert_eq!( + self.pages.len(), + self.graph.nodes.len(), + "inconsistent number of pages" + ); + + assert_eq!( + self.links.len(), + self.graph.edges.len(), + "inconsistent number of links" + ); + + self.graph.check_consistency(); + } + + pub fn redirect_target(&self, node: NodeIdx) -> Option { + if !self.pages[node.usize()].redirect { + return None; + } + + self.graph.edge_slice(node).first().copied() + } +} diff --git a/brood/src/data/adjacency_list.rs b/brood/src/data/adjacency_list.rs deleted file mode 100644 index 04a1124..0000000 --- a/brood/src/data/adjacency_list.rs +++ /dev/null @@ -1,160 +0,0 @@ -use std::ops::Range; - -use super::info::{LinkInfo, PageInfo}; - -#[derive(Debug, Clone, Copy)] -pub struct Page

{ - /// Index of the first link belonging to this page. - pub start: u32, - pub data: P, -} - -impl

Page

{ - pub fn change_data(self, f: impl Fn(P) -> P2) -> Page { - Page { - start: self.start, - data: f(self.data), - } - } -} - -#[derive(Debug, Clone, Copy)] -pub struct Link { - /// Index of the page this link points to. - pub to: u32, - pub data: L, -} - -impl Link { - pub fn change_data(self, f: impl Fn(L) -> L2) -> Link { - Link { - to: self.to, - data: f(self.data), - } - } -} - -pub struct AdjacencyList { - pub pages: Vec>, - pub links: Vec>, -} - -impl Default for AdjacencyList { - fn default() -> Self { - Self { - pages: Default::default(), - links: Default::default(), - } - } -} - -impl AdjacencyList { - pub fn push_page(&mut self, data: P) { - self.pages.push(Page { - start: self.links.len() as u32, - data, - }); - } - - pub fn push_link(&mut self, to: u32, data: L) { - self.links.push(Link { to, data }) - } - - pub fn page(&self, page_idx: u32) -> &Page

{ - &self.pages[page_idx as usize] - } - - pub fn page_mut(&mut self, page_idx: u32) -> &mut Page

{ - &mut self.pages[page_idx as usize] - } - - pub fn pages(&self) -> impl Iterator)> { - self.pages.iter().enumerate().map(|(i, p)| (i as u32, p)) - } - - pub fn link(&self, link_idx: u32) -> &Link { - &self.links[link_idx as usize] - } - - pub fn link_mut(&mut self, link_idx: u32) -> &mut Link { - &mut self.links[link_idx as usize] - } - - pub fn link_range(&self, page_idx: u32) -> Range { - let start_idx = self.pages[page_idx as usize].start; - let end_idx = match self.pages.get(page_idx as usize + 1) { - Some(page) => page.start, - None => self.links.len() as u32, - }; - start_idx..end_idx - } - - pub fn link_redirect(&self, page_idx: u32) -> Option { - let range = self.link_range(page_idx); - if range.is_empty() { - None - } else { - Some(range.start) - } - } - - pub fn links(&self, page_idx: u32) -> impl Iterator)> { - self.link_range(page_idx).map(|i| (i, self.link(i))) - } - - pub fn change_page_data(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList { - let pages = self - .pages - .into_iter() - .map(|p| p.change_data(page_f)) - .collect::>(); - - AdjacencyList { - pages, - links: self.links, - } - } - - pub fn change_link_data(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList { - let links = self - .links - .into_iter() - .map(|l| l.change_data(link_f)) - .collect::>(); - - AdjacencyList { - pages: self.pages, - links, - } - } -} - -impl AdjacencyList { - pub fn check_consistency(&self) { - // Check that all types are large enough - assert!(self.pages.len() < u32::MAX as usize, "too many pages"); - assert!(self.links.len() < u32::MAX as usize, "too many links"); - for page in &self.pages { - assert!( - page.data.title.len() <= u8::MAX as usize, - "page title too long" - ); - } - - // Check that all links contain valid indices. Links must not link to - // the sentinel page. 
- let range = 0..self.pages.len() as u32; - for link in &self.links { - assert!(range.contains(&link.to), "invalid link"); - } - - // Check that all redirect pages have at most one link - for (page_idx, page) in self.pages.iter().enumerate() { - if page.data.redirect { - let range = self.link_range(page_idx as u32); - let amount = range.end - range.start; - assert!(amount <= 1, "too many redirect links"); - } - } - } -} diff --git a/brood/src/data/info.rs b/brood/src/data/info.rs deleted file mode 100644 index dad04d4..0000000 --- a/brood/src/data/info.rs +++ /dev/null @@ -1,24 +0,0 @@ -#[derive(Debug, Clone)] -pub struct PageInfo { - pub id: u32, - pub title: String, - pub length: u32, - pub redirect: bool, -} - -#[derive(Debug, Default, Clone, Copy)] -pub struct LinkInfo { - pub start: u32, - pub len: u32, - pub flags: u8, -} - -impl LinkInfo { - pub fn in_parens(self) -> bool { - self.flags & 0b1 != 0 - } - - pub fn in_structure(self) -> bool { - self.flags & 0b10 != 0 - } -} diff --git a/brood/src/data/store.rs b/brood/src/data/store.rs deleted file mode 100644 index afba1a3..0000000 --- a/brood/src/data/store.rs +++ /dev/null @@ -1,134 +0,0 @@ -use std::io::{self, Read, Write}; - -use super::{ - adjacency_list::{AdjacencyList, Link, Page}, - info::{LinkInfo, PageInfo}, -}; - -fn write_u8(n: u8, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) -} - -fn read_u8(from: &mut R) -> io::Result { - let mut buf = [0_u8; 1]; - from.read_exact(&mut buf)?; - Ok(u8::from_le_bytes(buf)) -} - -fn write_u16(n: u16, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) -} - -fn read_u16(from: &mut R) -> io::Result { - let mut buf = [0_u8; 2]; - from.read_exact(&mut buf)?; - Ok(u16::from_le_bytes(buf)) -} - -fn write_u32(n: u32, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) -} - -fn read_u32(from: &mut R) -> io::Result { - let mut buf = [0_u8; 4]; - from.read_exact(&mut buf)?; - Ok(u32::from_le_bytes(buf)) -} - -fn write_str(s: 
&str, to: &mut W) -> io::Result<()> { - assert!(s.len() <= u16::MAX as usize); - write_u16(s.len() as u16, to)?; - to.write_all(s.as_bytes())?; - Ok(()) -} - -fn read_str(from: &mut R) -> io::Result { - let len = read_u16(from)? as usize; - let mut buf = vec![0_u8; len]; - from.read_exact(&mut buf)?; - Ok(String::from_utf8(buf).unwrap()) -} - -fn write_page(page: &Page, to: &mut W) -> io::Result<()> { - write_u32(page.start, to)?; - write_u32(page.data.id, to)?; - write_u32(page.data.length, to)?; - write_u8(if page.data.redirect { 1 } else { 0 }, to)?; - write_str(&page.data.title, to)?; - - Ok(()) -} - -pub fn read_page(from: &mut R) -> io::Result> { - let start_link_idx = read_u32(from)?; - let id = read_u32(from)?; - let length = read_u32(from)?; - let redirect = read_u8(from)? != 0; - let title = read_str(from)?; - - Ok(Page { - start: start_link_idx, - data: PageInfo { - id, - length, - redirect, - title, - }, - }) -} - -fn write_link(link: &Link, to: &mut W) -> io::Result<()> { - write_u32(link.to, to)?; - write_u32(link.data.start, to)?; - write_u32(link.data.len, to)?; - write_u8(link.data.flags, to)?; - - Ok(()) -} - -fn read_link(from: &mut R) -> io::Result> { - let to_page_idx = read_u32(from)?; - let start = read_u32(from)?; - let len = read_u32(from)?; - let flags = read_u8(from)?; - - Ok(Link { - to: to_page_idx, - data: LinkInfo { start, len, flags }, - }) -} - -pub fn write_adjacency_list( - al: &AdjacencyList, - to: &mut W, -) -> io::Result<()> { - write_u32(al.pages.len() as u32, to)?; - write_u32(al.links.len() as u32, to)?; - - for page in &al.pages { - write_page(page, to)?; - } - - for link in &al.links { - write_link(link, to)?; - } - - Ok(()) -} - -pub fn read_adjacency_list(from: &mut R) -> io::Result> { - let n_pages = read_u32(from)?; - let n_links = read_u32(from)?; - - let mut pages = vec![]; - for _ in 0..n_pages { - pages.push(read_page(from)?); - } - - let mut links = vec![]; - for _ in 0..n_links { - links.push(read_link(from)?); - 
} - - Ok(AdjacencyList { pages, links }) -} diff --git a/brood/src/graph.rs b/brood/src/graph.rs new file mode 100644 index 0000000..a869300 --- /dev/null +++ b/brood/src/graph.rs @@ -0,0 +1,295 @@ +use std::ops::{Add, AddAssign, Range, Sub, SubAssign}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct NodeIdx(pub u32); + +impl NodeIdx { + pub const NONE: Self = Self(u32::MAX); + + #[inline] + pub const fn new(value: usize) -> Self { + Self(value as u32) + } + + #[inline] + pub const fn usize(self) -> usize { + self.0 as usize + } +} + +impl From for NodeIdx { + fn from(value: u32) -> Self { + Self(value) + } +} + +impl From for NodeIdx { + fn from(value: usize) -> Self { + Self::new(value) + } +} + +impl Add for NodeIdx { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +impl AddAssign for NodeIdx { + fn add_assign(&mut self, rhs: Self) { + self.0 += rhs.0; + } +} + +impl Sub for NodeIdx { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0 - rhs.0) + } +} + +impl SubAssign for NodeIdx { + fn sub_assign(&mut self, rhs: Self) { + self.0 -= rhs.0; + } +} + +impl Add for NodeIdx { + type Output = Self; + + fn add(self, rhs: u32) -> Self::Output { + Self(self.0 + rhs) + } +} + +impl AddAssign for NodeIdx { + fn add_assign(&mut self, rhs: u32) { + self.0 += rhs; + } +} + +impl Sub for NodeIdx { + type Output = Self; + + fn sub(self, rhs: u32) -> Self::Output { + Self(self.0 - rhs) + } +} + +impl SubAssign for NodeIdx { + fn sub_assign(&mut self, rhs: u32) { + self.0 -= rhs; + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct EdgeIdx(pub u32); + +impl EdgeIdx { + #[inline] + pub const fn new(value: usize) -> Self { + Self(value as u32) + } + + #[inline] + pub const fn usize(self) -> usize { + self.0 as usize + } +} + +impl From for EdgeIdx { + fn from(value: u32) -> Self { + Self(value) + } +} + +impl From for EdgeIdx { 
+ fn from(value: usize) -> Self { + Self::new(value) + } +} + +impl Add for EdgeIdx { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +impl AddAssign for EdgeIdx { + fn add_assign(&mut self, rhs: Self) { + self.0 += rhs.0; + } +} + +impl Sub for EdgeIdx { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0 - rhs.0) + } +} + +impl SubAssign for EdgeIdx { + fn sub_assign(&mut self, rhs: Self) { + self.0 -= rhs.0; + } +} + +impl Add for EdgeIdx { + type Output = Self; + + fn add(self, rhs: u32) -> Self::Output { + Self(self.0 + rhs) + } +} + +impl AddAssign for EdgeIdx { + fn add_assign(&mut self, rhs: u32) { + self.0 += rhs; + } +} + +impl Sub for EdgeIdx { + type Output = Self; + + fn sub(self, rhs: u32) -> Self::Output { + Self(self.0 - rhs) + } +} + +impl SubAssign for EdgeIdx { + fn sub_assign(&mut self, rhs: u32) { + self.0 -= rhs; + } +} + +#[derive(Default)] +pub struct Graph { + /// A node points to the first of its edges. + /// + /// A special case is that if the subsequent node points to the same edge, + /// the current node has no edges. + pub nodes: Vec, + + /// An edge points to a target node. + /// + /// The source node is defined implicitly by the graph data structure. 
+ pub edges: Vec, +} + +impl Graph { + pub fn with_capacity(nodes: usize, edges: usize) -> Self { + Self { + nodes: Vec::with_capacity(nodes), + edges: Vec::with_capacity(edges), + } + } + + pub fn add_node(&mut self) { + self.nodes.push(EdgeIdx::new(self.edges.len())); + } + + pub fn add_edge(&mut self, target: NodeIdx) { + self.edges.push(target); + } + + pub fn check_consistency(&self) { + if self.nodes.is_empty() { + assert!(self.edges.is_empty(), "edges must belong to existing nodes"); + return; + } + + assert!(self.nodes.len() < u32::MAX as usize, "too many nodes"); + assert!(self.edges.len() < u32::MAX as usize, "too many edges"); + + assert_eq!( + *self.nodes.first().unwrap(), + EdgeIdx(0), + "first node pointer must be 0" + ); + + for (ni, node) in self.nodes.iter().cloned().enumerate() { + assert!( + node.usize() <= self.edges.len(), + "node pointers must be in range" + ); + + if let Some(succ) = self.nodes.get(ni + 1) { + assert!(node <= *succ, "node pointers must be well-ordered"); + } + } + + for edge in &self.edges { + assert!( + edge.usize() < self.nodes.len(), + "edge pointers must be in range" + ); + } + } + + pub fn nodes(&self) -> impl Iterator + '_ { + (0..self.nodes.len()).map(NodeIdx::new) + } + + pub fn edges(&self) -> impl Iterator + '_ { + Edges::new(self) + } + + pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx { + self.nodes + .get(node.usize()) + .copied() + .unwrap_or_else(|| self.edges.len().into()) + } + + pub fn edge_range(&self, node: NodeIdx) -> Range { + let start = self.nodes[node.usize()]; + let end = self.edge_start(node + 1); + start.usize()..end.usize() + } + + pub fn edge_slice(&self, node: NodeIdx) -> &[NodeIdx] { + &self.edges[self.edge_range(node)] + } +} + +struct Edges<'a> { + graph: &'a Graph, + ni: NodeIdx, + ei: EdgeIdx, +} + +impl<'a> Edges<'a> { + fn new(graph: &'a Graph) -> Self { + Self { + graph, + ni: NodeIdx(0), + ei: EdgeIdx(0), + } + } +} + +impl Iterator for Edges<'_> { + type Item = (NodeIdx, NodeIdx); + 
+ fn next(&mut self) -> Option { + if self.ei.usize() >= self.graph.edges.len() { + return None; + } + let target = self.graph.edges[self.ei.usize()]; + + // if would not be sufficient because some nodes may not have any edges. + while self.ei >= self.graph.edge_start(self.ni + 1) { + self.ni += 1; + } + let source = self.ni; + + self.ei += 1; + Some((source, target)) + } +} diff --git a/brood/src/main.rs b/brood/src/main.rs index e4b4074..270aee8 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -1,49 +1,23 @@ -pub mod commands; +mod algo; +mod commands; mod data; +mod graph; mod util; -use std::io; -use std::path::PathBuf; +use std::{io, path::PathBuf}; use clap::Parser; - -#[derive(Debug, PartialEq, Eq, Parser)] -pub enum PhilosophyGameCmd { - First, - Canonical, - Cluster, - Trace { start: String }, -} +use data::Data; #[derive(Debug, Parser)] enum Command { - /// Read sift data on stdin and output brood data. - Ingest, - /// Read and reexport brood data. - Reexport { - to: PathBuf, - #[arg(long, short = 'P')] - in_parens: Option, - #[arg(long, short = 'S')] - in_structure: Option, - }, - /// Find a path from one article to another. - Path { - from: String, - to: String, - /// Flip start and end article. - #[arg(short, long)] - flip: bool, - }, - /// Find the longest shortest path starting at an article. - LongestShortestPath { from: String }, - /// Analyze articles using "Philosophy Game" rules. - PhilosophyGame { - #[command(subcommand)] - subcmd: PhilosophyGameCmd, - }, - /// Print all page titles. 
- ListPages, + Ingest(commands::ingest::Cmd), + Export(commands::export::Cmd), + Show(commands::show::Cmd), + Stats(commands::stats::Cmd), + Path(commands::path::Cmd), + LongestPath(commands::longest_path::Cmd), + Pg(commands::pg::Cmd), } #[derive(Debug, Parser)] @@ -51,30 +25,59 @@ struct Args { datafile: PathBuf, #[command(subcommand)] command: Command, + #[arg(long, short = 'P')] + in_parens: Option, + #[arg(long, short = 'S')] + in_structure: Option, + #[arg(long, short = 'R')] + resolve_redirects: bool, + #[arg(long, short = 'I')] + invert_edges: bool, + #[arg(long, short)] + check_consistency: bool, } fn main() -> io::Result<()> { let args = Args::parse(); + + if let Command::Ingest(cmd) = &args.command { + return cmd.run(&args.datafile); + } + + println!(">> Import"); + println!("> Reading data"); + let mut data = Data::read_from_file(&args.datafile)?; + + if args.in_parens.is_some() || args.in_structure.is_some() { + println!("> Filtering edges"); + algo::retain_edges(&mut data, |link| { + args.in_parens.is_none_or(|b| b == link.in_parens()) + && args.in_structure.is_none_or(|b| b == link.in_structure()) + }); + } + + if args.resolve_redirects { + println!("> Resolving redirects"); + algo::resolve_redirects(&mut data); + } + + if args.invert_edges { + println!("> Inverting edges"); + algo::invert(&mut data); + } + + if args.check_consistency { + println!("> Checking consistencey"); + data.check_consistency(); + } + match args.command { - Command::Ingest => commands::ingest::ingest(&args.datafile), - Command::Reexport { - to, - in_parens, - in_structure, - } => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure), - Command::Path { from, to, flip } => { - if flip { - commands::path::path(&args.datafile, &to, &from) - } else { - commands::path::path(&args.datafile, &from, &to) - } - } - Command::LongestShortestPath { from } => { - commands::longest_shortest_path::run(&args.datafile, &from) - } - Command::PhilosophyGame { subcmd } => { - 
commands::philosophy_game::run(&args.datafile, subcmd) - } - Command::ListPages => commands::list_pages::run(&args.datafile), + Command::Ingest(_) => unreachable!(), + Command::Export(cmd) => cmd.run(data), + Command::Show(cmd) => cmd.run(data), + Command::Stats(cmd) => cmd.run(data), + Command::Path(cmd) => cmd.run(data), + Command::LongestPath(cmd) => cmd.run(data), + Command::Pg(cmd) => cmd.run(data), } } diff --git a/brood/src/util.rs b/brood/src/util.rs index e1a64ff..cc6ee42 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,39 +1,160 @@ -use crate::data::{ - adjacency_list::{AdjacencyList, Page}, - info::{LinkInfo, PageInfo}, +use std::{collections::HashSet, fmt}; + +use regex::Regex; + +use crate::{ + data::{Data, Page}, + graph::NodeIdx, }; -pub fn normalize_link(link: &str) -> String { - let link = link.trim().replace(' ', "_"); +// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/mediawiki.Title.phpCharToUpper.js +struct PhpCharToUpper(char); - // Make only first char lowercase - link.chars() - .next() - .iter() - .flat_map(|c| c.to_lowercase()) - .chain(link.chars().skip(1)) - .collect::() -} +impl fmt::Display for PhpCharToUpper { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.0 { + // Do something special, I guess + 'ᾀ' => write!(f, "ᾈ"), + 'ᾁ' => write!(f, "ᾉ"), + 'ᾂ' => write!(f, "ᾊ"), + 'ᾃ' => write!(f, "ᾋ"), + 'ᾄ' => write!(f, "ᾌ"), + 'ᾅ' => write!(f, "ᾍ"), + 'ᾆ' => write!(f, "ᾎ"), + 'ᾇ' => write!(f, "ᾏ"), + 'ᾐ' => write!(f, "ᾘ"), + 'ᾑ' => write!(f, "ᾙ"), + 'ᾒ' => write!(f, "ᾚ"), + 'ᾓ' => write!(f, "ᾛ"), + 'ᾔ' => write!(f, "ᾜ"), + 'ᾕ' => write!(f, "ᾝ"), + 'ᾖ' => write!(f, "ᾞ"), + 'ᾗ' => write!(f, "ᾟ"), + 'ᾠ' => write!(f, "ᾨ"), + 'ᾡ' => write!(f, "ᾩ"), + 'ᾢ' => write!(f, "ᾪ"), + 'ᾣ' => write!(f, "ᾫ"), + 'ᾤ' => write!(f, "ᾬ"), + 'ᾥ' => write!(f, "ᾭ"), + 'ᾦ' => write!(f, "ᾮ"), + 'ᾧ' => write!(f, "ᾯ"), + 'ᾳ' => write!(f, "ᾼ"), + 'ῃ' => write!(f, "ῌ"), + 'ῳ' => write!(f, 
"ῼ"), -pub fn find_index_of_title(pages: &[Page], title: &str) -> u32 { - let title = normalize_link(title); - pages - .iter() - .enumerate() - .find(|(_, p)| normalize_link(&p.data.title) == title) - .map(|(i, _)| i) - .expect("invalid title") as u32 -} - -pub fn resolve_redirects(data: &AdjacencyList, mut page_idx: u32) -> u32 { - loop { - if data.page(page_idx).data.redirect { - if let Some(link_idx) = data.link_redirect(page_idx) { - page_idx = data.link(link_idx).to; - continue; + // Do not capitalize + 'ß' | 'ʼn' | 'ǰ' | 'ʂ' | 'ͅ' | 'ΐ' | 'ΰ' | 'և' | 'ა' | 'ბ' | 'გ' | 'დ' | 'ე' | 'ვ' + | 'ზ' | 'თ' | 'ი' | 'კ' | 'ლ' | 'მ' | 'ნ' | 'ო' | 'პ' | 'ჟ' | 'რ' | 'ს' | 'ტ' | 'უ' + | 'ფ' | 'ქ' | 'ღ' | 'ყ' | 'შ' | 'ჩ' | 'ც' | 'ძ' | 'წ' | 'ჭ' | 'ხ' | 'ჯ' | 'ჰ' | 'ჱ' + | 'ჲ' | 'ჳ' | 'ჴ' | 'ჵ' | 'ჶ' | 'ჷ' | 'ჸ' | 'ჹ' | 'ჺ' | 'ჽ' | 'ჾ' | 'ჿ' | 'ᶎ' | 'ẖ' + | 'ẗ' | 'ẘ' | 'ẙ' | 'ẚ' | 'ὐ' | 'ὒ' | 'ὔ' | 'ὖ' | 'ᾈ' | 'ᾉ' | 'ᾊ' | 'ᾋ' | 'ᾌ' | 'ᾍ' + | 'ᾎ' | 'ᾏ' | 'ᾘ' | 'ᾙ' | 'ᾚ' | 'ᾛ' | 'ᾜ' | 'ᾝ' | 'ᾞ' | 'ᾟ' | 'ᾨ' | 'ᾩ' | 'ᾪ' | 'ᾫ' + | 'ᾬ' | 'ᾭ' | 'ᾮ' | 'ᾯ' | 'ᾲ' | 'ᾴ' | 'ᾶ' | 'ᾷ' | 'ᾼ' | 'ῂ' | 'ῄ' | 'ῆ' | 'ῇ' | 'ῌ' + | 'ῒ' | 'ΐ' | 'ῖ' | 'ῗ' | 'ῢ' | 'ΰ' | 'ῤ' | 'ῦ' | 'ῧ' | 'ῲ' | 'ῴ' | 'ῶ' | 'ῷ' | 'ῼ' + | 'ⅰ' | 'ⅱ' | 'ⅲ' | 'ⅳ' | 'ⅴ' | 'ⅵ' | 'ⅶ' | 'ⅷ' | 'ⅸ' | 'ⅹ' | 'ⅺ' | 'ⅻ' | 'ⅼ' | 'ⅽ' + | 'ⅾ' | 'ⅿ' | 'ⓐ' | 'ⓑ' | 'ⓒ' | 'ⓓ' | 'ⓔ' | 'ⓕ' | 'ⓖ' | 'ⓗ' | 'ⓘ' | 'ⓙ' | 'ⓚ' | 'ⓛ' + | 'ⓜ' | 'ⓝ' | 'ⓞ' | 'ⓟ' | 'ⓠ' | 'ⓡ' | 'ⓢ' | 'ⓣ' | 'ⓤ' | 'ⓥ' | 'ⓦ' | 'ⓧ' | 'ⓨ' | 'ⓩ' + | 'ꞔ' | 'ꞹ' | 'ꞻ' | 'ꞽ' | 'ꞿ' | 'ꟃ' | 'ff' | 'fi' | 'fl' | 'ffi' | 'ffl' | 'ſt' | 'st' | 'ﬓ' + | 'ﬔ' | 'ﬕ' | 'ﬖ' | 'ﬗ' | '𖹠' | '𖹡' | '𖹢' | '𖹣' | '𖹤' | '𖹥' | '𖹦' | '𖹧' | '𖹨' | '𖹩' + | '𖹪' | '𖹫' | '𖹬' | '𖹭' | '𖹮' | '𖹯' | '𖹰' | '𖹱' | '𖹲' | '𖹳' | '𖹴' | '𖹵' | '𖹶' | '𖹷' + | '𖹸' | '𖹹' | '𖹺' | '𖹻' | '𖹼' | '𖹽' | '𖹾' | '𖹿' => { + write!(f, "{}", self.0) } - } - return page_idx; + // Capitalize normally + c => write!(f, "{}", c.to_uppercase()), + } + } +} + +pub struct TitleNormalizer { + strip_bidi: Regex, + clean_up_whitespace: Regex, + 
trim_underscore_start: Regex, + trim_underscore_end: Regex, +} + +impl TitleNormalizer { + pub fn new() -> Self { + Self { + strip_bidi: Regex::new("[\u{200E}\u{200F}\u{202A}-\u{202E}]").unwrap(), + + clean_up_whitespace: Regex::new(concat!( + "[ _\u{00A0}\u{1680}\u{180E}\u{2000}-\u{200A}", + "\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+" + )) + .unwrap(), + + trim_underscore_start: Regex::new("^_+").unwrap(), + + trim_underscore_end: Regex::new("_+$").unwrap(), + } + } + + /// Normalize an article title. + /// + /// See also . + pub fn normalize(&self, title: &str) -> String { + // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L403 + + // Strip Unicode bidi override characters + let title = self.strip_bidi.replace_all(title, ""); + + // Clean up whitespace + let title = self.clean_up_whitespace.replace_all(&title, "_"); + + // Trim _ from beginning and end + let title = self.trim_underscore_start.replace_all(&title, ""); + let title = self.trim_underscore_end.replace_all(&title, ""); + + // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L206 + let Some(first) = title.chars().next() else { + return String::new(); + }; + let rest = &title[first.len_utf8()..]; + format!("{}{rest}", PhpCharToUpper(first)) + } +} + +pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { + let normalized = normalizer.normalize(title); + data.pages + .iter() + .enumerate() + .find(|(_, p)| normalizer.normalize(&p.title) == normalized) + .map(|(i, _)| NodeIdx::new(i)) + .expect("invalid title") +} + +pub fn resolve_redirects(data: &Data, node: NodeIdx) -> NodeIdx { + let mut curr = node; + let mut seen = HashSet::new(); + + seen.insert(curr); + while let Some(target) = data.redirect_target(curr) { + if seen.contains(&target) { + println!( + " Redirect cycle deteted: {:?}", + data.pages[node.usize()].title + ); + break; + } + + 
seen.insert(target); + curr = target; + } + + curr +} + +pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { + resolve_redirects(data, locate_title(normalizer, data, title)) +} + +pub fn fmt_page(page: &Page) -> String { + if page.redirect { + format!("v {}", page.title) + } else { + format!("- {}", page.title) } } diff --git a/sift/sift.py b/sift/sift.py index bde2e74..2562fa2 100644 --- a/sift/sift.py +++ b/sift/sift.py @@ -172,16 +172,21 @@ def process_xmldump_page(page): # Page info as simple tuples def simple_pages(input): dump = mwxml.Dump.from_file(sys.stdin) + articles = 0 for i, page in enumerate(dump.pages): + if (i + 1) % 1000 == 0: + # Yeah, the articles are usually off by one + eprint(f"{i+1:8} pages, {articles:8} articles, at pid {page.id:8}") + if page.namespace != 0: continue - if (i + 1) % 1000 == 0: - eprint(f"{i+1:8} pages, at pid {page.id:8}") - + articles += 1 [revision] = list(page) # Every page has exactly one revision yield page.id, page.title, revision.text or "", page.redirect + eprint(f"{articles} articles total") + def process_simple_page(info): pid, title, text, redirect = info