diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 180ca5c..0162043 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -1,21 +1,12 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 4 - -[[package]] -name = "aho-corasick" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" -dependencies = [ - "memchr", -] +version = 3 [[package]] name = "anstream" -version = "0.6.18" +version = "0.6.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" dependencies = [ "anstyle", "anstyle-parse", @@ -28,33 +19,33 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.10" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.2" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.6" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" dependencies = [ "anstyle", "windows-sys", @@ -65,30 +56,16 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", - "indicatif", - "regex", + "rustc-hash", "serde", "serde_json", - "thousands", ] -[[package]] -name = "bumpalo" -version = "3.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - [[package]] name = "clap" -version = "4.5.23" +version = "4.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" +checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f" dependencies = [ "clap_builder", "clap_derive", @@ -96,9 +73,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.23" +version = "4.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" +checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f" dependencies = [ "anstream", "anstyle", @@ -108,9 +85,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.18" +version = "4.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6" dependencies = [ "heck", "proc-macro2", @@ -120,34 +97,15 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.4" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" [[package]] name = "colorchoice" -version = "1.0.3" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" - -[[package]] -name = "console" -version = "0.15.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" -dependencies = [ - "encode_unicode", - "libc", - "once_cell", - "unicode-width", - "windows-sys", -] - -[[package]] -name = "encode_unicode" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" [[package]] name = "heck" @@ -155,123 +113,41 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "indicatif" -version = "0.17.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" -dependencies = [ - "console", - "number_prefix", - "portable-atomic", - "unicode-width", - "web-time", -] - [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" [[package]] name = "itoa" -version = "1.0.14" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" - -[[package]] -name = "js-sys" -version = "0.3.76" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" -dependencies = [ - "once_cell", - "wasm-bindgen", -] - -[[package]] -name = "libc" -version = "0.2.169" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" - -[[package]] -name = "log" -version = "0.4.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" - -[[package]] -name = "memchr" -version = "2.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" - -[[package]] -name = "number_prefix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - -[[package]] -name = "once_cell" -version = "1.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" - -[[package]] -name = "portable-atomic" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.38" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] [[package]] -name = "regex" -version = "1.11.1" +name = "rustc-hash" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "ryu" @@ -281,18 +157,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "serde" -version = "1.0.217" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", @@ -301,12 +177,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.134" +version = "1.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d" +checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4" dependencies = [ "itoa", - "memchr", "ryu", "serde", ] @@ -319,32 +194,20 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.93" +version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] -[[package]] -name = "thousands" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" - [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" - -[[package]] -name = "unicode-width" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "utf8parse" @@ -352,84 +215,20 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" -[[package]] -name = "wasm-bindgen" -version = "0.2.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" -dependencies = [ - "cfg-if", - "once_cell", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" - -[[package]] -name = "web-time" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "windows-sys" -version = "0.59.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -443,48 +242,48 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" [[package]] name = "windows_i686_gnullvm" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" -version = "0.52.6" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index 99890b6..940f920 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -4,9 +4,7 @@ version = "0.0.0" edition = "2021" [dependencies] -clap = { version = "4.5.23", features = ["derive", "deprecated"] } -indicatif = "0.17.9" -regex = "1.11.1" -serde = { version = "1.0.217", features = ["derive"] } -serde_json = "1.0.134" -thousands = "0.2.0" +clap = { version = "4.5.7", features = ["derive", "deprecated"] } +rustc-hash = "2.0.0" +serde = { version = "1.0.203", features = ["derive"] } +serde_json = "1.0.118" diff --git a/brood/src/algo.rs b/brood/src/algo.rs deleted file mode 100644 index ac1919f..0000000 --- a/brood/src/algo.rs +++ /dev/null @@ -1,4 +0,0 @@ -mod dijkstra; -mod edit; - -pub use self::{dijkstra::*, edit::*}; diff --git a/brood/src/algo/dijkstra.rs b/brood/src/algo/dijkstra.rs deleted file mode 100644 index b6bf26a..0000000 --- a/brood/src/algo/dijkstra.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::{cmp::Reverse, collections::BinaryHeap}; - -use crate::graph::{EdgeIdx, Graph, NodeIdx}; - -pub struct Dijkstra<'a> { - graph: &'a Graph, - cost: Vec, - pred: Vec, -} - -impl<'a> Dijkstra<'a> { - pub fn new(graph: &'a Graph) -> Self { - Self { - graph, - cost: vec![u32::MAX; graph.nodes.len()], - pred: vec![NodeIdx::NONE; graph.nodes.len()], - } - } - - pub fn run( - &mut self, - start: NodeIdx, - goal: impl Fn(NodeIdx) -> bool, - cost: impl Fn(NodeIdx, EdgeIdx, NodeIdx) -> u32, - ) { - self.cost[start.usize()] = 0; - let mut queue = BinaryHeap::new(); - queue.push((Reverse(0), start)); - - while let Some((Reverse(curr_cost), curr)) = queue.pop() { - if goal(curr) { - break; // We've found the shortest path to our target - } - - // These seem to never actually occur - // if curr_cost > self.cost[curr.usize()] { - // continue; // Outdated entry - // } - - for edge in self.graph.edge_range(curr).map(EdgeIdx::new) { - let next = self.graph.edges[edge.usize()]; - let next_cost = curr_cost + cost(curr, edge, next); - if next_cost < self.cost[next.usize()] { - self.cost[next.usize()] = next_cost; - self.pred[next.usize()] = curr; - queue.push((Reverse(next_cost), next)); - } - } - } - } - - #[inline] - pub fn cost(&self, node: NodeIdx) -> u32 { - self.cost[node.usize()] - } - - #[inline] - pub fn pred(&self, node: NodeIdx) -> NodeIdx { - self.pred[node.usize()] - } - - pub fn path(&self, goal: NodeIdx) -> Vec { - let mut path = vec![]; - let mut at = goal; - - loop { - path.push(at); - at = self.pred(at); - if at == NodeIdx::NONE { - break; - } - } - - path.reverse(); - path - } -} diff --git a/brood/src/algo/edit.rs b/brood/src/algo/edit.rs deleted file mode 100644 index 2be0c0a..0000000 --- a/brood/src/algo/edit.rs +++ /dev/null @@ -1,97 +0,0 @@ -use std::mem; - -use crate::{ - data::{Data, Link}, - graph::NodeIdx, - util, -}; - -pub fn retain_edges(data: &mut Data, f: impl Fn(&Link) -> bool) { - let mut links = mem::take(&mut data.links).into_iter(); - let graph = mem::take(&mut data.graph); - - for node in graph.nodes() { - data.graph.add_node(); - - for edge in graph.edge_slice(node) { - let link = links.next().unwrap(); - if f(&link) { - data.links.push(link); - data.graph.add_edge(*edge); - } - } - } -} - -pub fn resolve_redirects(data: &mut Data) { - // Permutation from input node to input node - let mut perm_redirect = vec![NodeIdx::NONE; data.pages.len()]; - for node in data.graph.nodes() { - perm_redirect[node.usize()] = util::resolve_redirects(data, node); - } - - // Permutation from input node to final node - let mut perm_retain = vec![NodeIdx::NONE; data.pages.len()]; - let mut perm_retain_count = NodeIdx(0); - for (i, page) in data.pages.iter().enumerate() { - if !page.redirect { - perm_retain[i] = perm_retain_count; - perm_retain_count += 1; - } - } - - let mut pages = mem::take(&mut data.pages).into_iter(); - let mut links = mem::take(&mut data.links).into_iter(); - let graph = mem::take(&mut data.graph); - - for node in graph.nodes() { - let page = pages.next().unwrap(); - let new_node = perm_retain[node.usize()]; - - if new_node == NodeIdx::NONE { - // Skip all edges - for _ in graph.edge_slice(node) { - links.next().unwrap(); - } - continue; - } - - data.pages.push(page); - data.graph.add_node(); - - for edge in graph.edge_slice(node) { - let link = links.next().unwrap(); - let new_edge = perm_retain[perm_redirect[edge.usize()].usize()]; - - if new_edge == NodeIdx::NONE { - continue; - } - - data.links.push(link); - data.graph.add_edge(new_edge); - } - } -} - -pub fn invert(data: &mut Data) { - let links = mem::take(&mut data.links); - let graph = mem::take(&mut data.graph); - - let mut edges = graph - .edges() - .zip(links) - .map(|((source, target), link)| (source, target, link)) - .collect::>(); - - edges.sort_by_key(|(_, target, _)| *target); - - let mut edges = edges.into_iter().peekable(); - for node in graph.nodes() { - data.graph.add_node(); - while edges.peek().is_some_and(|(_, target, _)| *target <= node) { - let (source, _, link) = edges.next().unwrap(); - data.graph.add_edge(source); - data.links.push(link); - } - } -} diff --git a/brood/src/commands.rs b/brood/src/commands.rs index fbb29d7..ffff9d3 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,7 +1,6 @@ -pub mod export; pub mod ingest; -pub mod longest_path; +pub mod list_pages; +pub mod longest_shortest_path; pub mod path; -pub mod pg; -pub mod show; -pub mod stats; +pub mod philosophy_game; +pub mod reexport; diff --git a/brood/src/commands/export.rs b/brood/src/commands/export.rs deleted file mode 100644 index aad5dd8..0000000 --- a/brood/src/commands/export.rs +++ /dev/null @@ -1,17 +0,0 @@ -use std::{io, path::PathBuf}; - -use crate::data::Data; - -#[derive(Debug, clap::Parser)] -pub struct Cmd { - out: PathBuf, -} - -impl Cmd { - pub fn run(self, data: Data) -> io::Result<()> { - println!(">> Export"); - data.write_to_file(&self.out)?; - - Ok(()) - } -} diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 74f5663..cda10d0 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -1,33 +1,16 @@ -use std::{ - collections::{hash_map::Entry, HashMap}, - fs::File, - io::{self, BufRead, BufReader, Seek}, - path::{Path, PathBuf}, -}; +use std::collections::hash_map::Entry; +use std::fs::File; +use std::io::{self, BufRead, BufReader, BufWriter}; +use std::path::Path; +use std::u32; -use indicatif::{ProgressBar, ProgressStyle}; +use rustc_hash::FxHashMap; use serde::Deserialize; -use thousands::Separable; -use crate::{ - data::{Data, Link, Page}, - graph::NodeIdx, - util::TitleNormalizer, -}; - -const PROGRESS_CHARS: &str = "█▉▊▋▌▍▎▏ "; - -fn seek_to_start(f: &mut BufReader) -> io::Result { - let size = f.seek(io::SeekFrom::End(0))?; - f.seek(io::SeekFrom::Start(0))?; - Ok(size) -} - -fn file_progress_style() -> ProgressStyle { - ProgressStyle::with_template("{wide_bar} {bytes}/{total_bytes}") - .unwrap() - .progress_chars(PROGRESS_CHARS) -} +use crate::data::adjacency_list::{AdjacencyList, Page}; +use crate::data::info::{LinkInfo, PageInfo}; +use crate::data::store; +use crate::util; #[derive(Deserialize)] struct JsonPage { @@ -38,161 +21,151 @@ struct JsonPage { redirect: Option, } -fn read_titles(f: &mut BufReader) -> io::Result> { - let size = seek_to_start(f)?; - let bar = ProgressBar::new(size).with_style(file_progress_style()); +/* +Importing is a tad complicated because of multiple criteria: - let mut titles = vec![]; +1. The data must be read in a single pass on stdin +2. The process should not consume a lot of memory + (can't store the decoded json data directly) +3. The process should result in a nice and compact adjacency list format - for line in bar.wrap_read(f).lines() { - let page = serde_json::from_str::(&line?).unwrap(); - titles.push(page.title); - } +Because of this, the import is a bit more complex and has two passes. - Ok(titles) +The first pass imports the data into an adjacency-list-like format, but the +`Link::to` field points to a title in `Titles` instead of a page. + +The second pass then resolves the links to page indices and throws away all +links that don't point to any known page. +*/ + +#[derive(Default)] +struct Titles { + /// Normalized titles + titles: Vec, + /// Map from normalized title to index in [`Self::titles`]. + map: FxHashMap, } -/// Returns a map from normalized title to the index in the brood data where the -/// article will appear. -/// -/// Titles in the title list are not always unique. When multiple identical -/// titles appear, all but one have to be discarded. Originally, I tried to be -/// smart and keep the last occurrence (under the assumption that its data would -/// be the newest), but this led to index-based bugs. Because of this, I now -/// keep the first occurrence. -fn compute_title_lookup( - normalizer: &TitleNormalizer, - titles: &[String], -) -> HashMap { - let mut title_lookup = HashMap::::new(); - - let bar = ProgressBar::new(titles.len() as u64) - .with_style(ProgressStyle::default_bar().progress_chars(PROGRESS_CHARS)); - - for (sift_i, title) in bar.wrap_iter(titles.iter()).enumerate() { - // The index where this article will appear in the final list, assuming - // it is not a duplicate. For ownership reasons, we compute this here - // instead of inside the Entry::Vacant branch of the following match. - let brood_i = title_lookup.len(); - - match title_lookup.entry(normalizer.normalize(title)) { - Entry::Vacant(entry) => { - entry.insert((sift_i as u32, brood_i as u32)); - } - Entry::Occupied(entry) => { - let prev_sift_i = entry.get().0; - let prev = &titles[prev_sift_i as usize]; - if prev == title { - bar.println(format!( - " {title:?} ({prev_sift_i}) occurs again at {sift_i}" - )); - } else { - bar.println(format!( - " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) normalize to {:?}", - normalizer.normalize(title) - )); - } +impl Titles { + fn insert(&mut self, title: String) -> u32 { + match self.map.entry(title.clone()) { + Entry::Occupied(occupied) => *occupied.get(), + Entry::Vacant(vacant) => { + let idx = self.titles.len() as u32; + self.titles.push(title); + vacant.insert(idx); + idx } } } - title_lookup + fn get(&self, i: u32) -> &str { + &self.titles[i as usize] + } } -fn read_page_data( - normalizer: &TitleNormalizer, - title_lookup: &HashMap, - f: &mut BufReader, -) -> io::Result { - let size = seek_to_start(f)?; - let bar = ProgressBar::new(size).with_style(file_progress_style()); +fn first_stage() -> io::Result<(AdjacencyList, Titles)> { + let mut titles = Titles::default(); + let mut result = AdjacencyList::default(); - let mut data = Data::new(); + let stdin = BufReader::new(io::stdin()); + for (i, line) in stdin.lines().enumerate() { + let json_page = serde_json::from_str::(&line?).unwrap(); - for (i, line) in bar.wrap_read(f).lines().enumerate() { - let page = serde_json::from_str::(&line?).unwrap(); - let normalized = normalizer.normalize(&page.title); - - let (sift_i, _) = title_lookup[&normalized]; - if i as u32 != sift_i { - // Articles may occur multiple times, and this is not the instance - // of the article we should keep. - bar.println(format!( - " Skipping {:?} ({i}) in favor of {sift_i}", - page.title - )); - continue; - } - - data.graph.add_node(); - data.pages.push(Page { - id: page.id, - title: page.title, - length: page.length, - redirect: page.redirect.is_some(), + result.push_page(PageInfo { + id: json_page.id, + length: json_page.length, + redirect: json_page.redirect.is_some(), + title: json_page.title, }); - let mut page_links = page.links; - if let Some(target) = page.redirect { - page_links.clear(); - let len = target.len() as u32; - page_links.push((target, 0, len, 0)); + if let Some(to) = json_page.redirect { + let to = titles.insert(util::normalize_link(&to)); + result.push_link(to, LinkInfo::default()); + } else { + for (to, start, len, flags) in json_page.links { + let to = titles.insert(util::normalize_link(&to)); + result.push_link(to, LinkInfo { start, len, flags }); + } } - for (target, start, len, flags) in page_links { - if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) { - data.graph.add_edge(NodeIdx(*brood_i)); - data.links.push(Link { start, len, flags }); + if (i + 1) % 100_000 == 0 { + eprintln!("{} pages imported", i + 1) + } + } + + eprintln!("Pages: {}", result.pages.len()); + eprintln!("Links: {}", result.links.len()); + eprintln!("Titles: {}", titles.titles.len()); + eprintln!("Title map entries: {}", titles.map.len()); + + Ok((result, titles)) +} + +/// Create map from normalized title to index in pages. +fn initialize_pages_map(pages: &[Page]) -> FxHashMap { + let mut result = FxHashMap::default(); + for (i, p) in pages.iter().enumerate() { + match result.entry(util::normalize_link(&p.data.title)) { + Entry::Occupied(entry) => { + eprintln!( + "{:?} already exists at index {} as {:?}", + p.data.title, + entry.get(), + util::normalize_link(&p.data.title) + ); + } + Entry::Vacant(entry) => { + entry.insert(i as u32); } } } - - Ok(data) + result } -/// Convert sift data to brood data. -#[derive(Debug, clap::Parser)] -pub struct Cmd { - /// The sift data file to ingest. - data: PathBuf, -} +fn second_stage( + first_stage: &AdjacencyList, + titles: &Titles, +) -> AdjacencyList { + let pages_map = initialize_pages_map(&first_stage.pages); + let mut result = AdjacencyList::default(); -impl Cmd { - pub fn run(&self, brood_data: &Path) -> io::Result<()> { - let normalizer = TitleNormalizer::new(); + for (page_idx, page) in first_stage.pages() { + result.push_page(page.data.clone()); - println!(">> First pass"); - let mut sift_data = BufReader::new(File::open(&self.data)?); + for (_, link) in first_stage.links(page_idx) { + let title = util::normalize_link(titles.get(link.to)); + if let Some(to) = pages_map.get(&title) { + // The link points to an existing article, we should keep it + result.push_link(*to, link.data); + } + } - println!("> Reading titles"); - let titles = read_titles(&mut sift_data)?; - - println!("> Computing title index lookup table"); - let title_lookup = compute_title_lookup(&normalizer, &titles); - drop(titles); // Don't hoard memory - - println!(">> Second pass"); - - println!("> Reading page data"); - let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?; - assert_eq!(data.pages.len(), title_lookup.len()); - drop(title_lookup); // Don't hoard memory - drop(sift_data); // No longer needed - - println!("> Checking consistency"); - data.check_consistency(); - - println!(">> Export"); - println!( - "Pages: {:>13}", - data.pages.len().separate_with_underscores() - ); - println!( - "Links: {:>13}", - data.links.len().separate_with_underscores() - ); - data.write_to_file(brood_data)?; - - Ok(()) + if (page_idx + 1) % 100_000 == 0 { + eprintln!("{} pages imported", page_idx + 1) + } } + + eprintln!("Pages: {}", result.pages.len()); + eprintln!("Links: {}", result.links.len()); + eprintln!("Page map entries: {}", pages_map.len()); + + result +} + +pub fn ingest(datafile: &Path) -> io::Result<()> { + eprintln!(">> First stage"); + let (first_stage, titles) = first_stage()?; + + eprintln!(">> Second stage"); + let data = second_stage(&first_stage, &titles); + + eprintln!(">> Consistency check"); + data.check_consistency(); + + eprintln!(">> Export"); + let mut datafile = BufWriter::new(File::create(datafile)?); + store::write_adjacency_list(&data, &mut datafile)?; + + Ok(()) } diff --git a/brood/src/commands/list_pages.rs b/brood/src/commands/list_pages.rs new file mode 100644 index 0000000..5f659ea --- /dev/null +++ b/brood/src/commands/list_pages.rs @@ -0,0 +1,23 @@ +use std::fs::File; +use std::io::{self, BufReader}; +use std::path::Path; + +use crate::data::store; + +pub fn run(datafile: &Path) -> io::Result<()> { + let mut databuf = BufReader::new(File::open(datafile)?); + let data = store::read_adjacency_list(&mut databuf)?; + + for (page_idx, page) in data.pages() { + if page.data.redirect { + for link_idx in data.link_range(page_idx) { + let target_page = data.page(data.link(link_idx).to); + println!("{:?} -> {:?}", page.data.title, target_page.data.title); + } + } else { + println!("{:?}", page.data.title); + } + } + + Ok(()) +} diff --git a/brood/src/commands/longest_path.rs b/brood/src/commands/longest_path.rs deleted file mode 100644 index 1ac8e40..0000000 --- a/brood/src/commands/longest_path.rs +++ /dev/null @@ -1,70 +0,0 @@ -use std::io; - -use crate::{ - algo::Dijkstra, - data::Data, - graph::NodeIdx, - util::{self, TitleNormalizer}, -}; - -/// Find the article with the longest shortest path away from the starting -/// article. -#[derive(Debug, clap::Parser)] -pub struct Cmd { - start: String, - #[arg(long, short, default_value_t = 1)] - top: usize, -} - -fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec)>) { - let start = &data.pages[start.usize()].title; - let goal = &data.pages[goal.usize()].title; - - let Some((cost, path)) = path else { - println!("No path found from {start} to {goal}"); - return; - }; - - println!("Path found (cost {cost}, length {}):", path.len()); - - for page in path { - println!("{}", util::fmt_page(&data.pages[page.usize()])); - } -} - -impl Cmd { - pub fn run(self, data: Data) -> io::Result<()> { - let normalizer = TitleNormalizer::new(); - - println!(">> Resolve article"); - let start = util::resolve_title(&normalizer, &data, &self.start); - println!("Start: {}", data.pages[start.usize()].title); - - println!(">> Search paths"); - println!("> Preparing dijkstra"); - let mut dijkstra = Dijkstra::new(&data.graph); - println!("> Running dijkstra"); - dijkstra.run( - start, - |_| false, - |source, _edge, _target| !data.pages[source.usize()].redirect as u32, - ); - - println!(">> Find longest paths"); - let mut costs = data - .graph - .nodes() - .map(|n| (dijkstra.cost(n), n)) - .filter(|(c, _)| *c < u32::MAX) // Only reachable nodes please - .collect::>(); - costs.sort_unstable(); - - for (cost, goal) in costs.iter().rev().take(self.top) { - let path = dijkstra.path(*goal); - println!(); - print_path(&data, start, *goal, Some((*cost, path))); - } - - Ok(()) - } -} diff --git a/brood/src/commands/longest_shortest_path.rs b/brood/src/commands/longest_shortest_path.rs new file mode 100644 index 0000000..e15eb17 --- /dev/null +++ b/brood/src/commands/longest_shortest_path.rs @@ -0,0 +1,173 @@ +use std::collections::BinaryHeap; +use std::fs::File; +use std::io::{self, BufReader}; +use std::path::Path; + +use crate::data::adjacency_list::AdjacencyList; +use crate::data::info::{LinkInfo, PageInfo}; +use crate::data::store; +use crate::util; + +struct DijkstraPageInfo { + cost: u32, + /// Index of the previous page. + prev: u32, + redirect: bool, +} + +impl DijkstraPageInfo { + fn from_page_info(info: PageInfo) -> Self { + Self { + cost: u32::MAX, + prev: u32::MAX, + redirect: info.redirect, + } + } +} + +struct DijkstraLinkInfo { + cost: u32, +} + +impl DijkstraLinkInfo { + fn from_link_info(info: LinkInfo) -> Self { + Self { + cost: 1, + // cost: 1000 + info.start, + // cost: 10000 + info.start, + // cost: 1000 + info.start / 10, + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq)] +struct Entry { + cost: u32, + page_idx: u32, +} + +impl Entry { + pub fn new(cost: u32, page_idx: u32) -> Self { + Self { cost, page_idx } + } +} + +// Manual implementation so the queue is a min-heap instead of a max-heap. +impl Ord for Entry { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + other + .cost + .cmp(&self.cost) + .then_with(|| self.page_idx.cmp(&other.page_idx)) + } +} + +impl PartialOrd for Entry { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +/// Closely matches the dijkstra example in [std::collections::binary_heap]. +fn full_dijkstra( + data: AdjacencyList, + from: u32, +) -> AdjacencyList { + println!("> Prepare state"); + let mut data = data + .change_page_data(DijkstraPageInfo::from_page_info) + .change_link_data(DijkstraLinkInfo::from_link_info); + let mut queue = BinaryHeap::new(); + data.page_mut(from).data.cost = 0; + queue.push(Entry::new(0, from)); + + println!("> Run dijkstra"); + while let Some(Entry { cost, page_idx }) = queue.pop() { + let page = data.page(page_idx); + if cost > page.data.cost { + // This queue entry is outdated + continue; + } + + let redirect = page.data.redirect; + for link_idx in data.link_range(page_idx) { + let link = data.link(link_idx); + + let next = Entry { + cost: cost + if redirect { 0 } else { link.data.cost }, + page_idx: link.to, + }; + + let target_page = data.page_mut(link.to); + if next.cost < target_page.data.cost { + target_page.data.cost = next.cost; + target_page.data.prev = page_idx; + queue.push(next); + } + } + } + + data +} + +fn find_longest_shortest_path( + data: AdjacencyList, + from: u32, +) -> Option> { + let to = data + .pages + .iter() + .enumerate() + .filter(|(_, p)| p.data.cost != u32::MAX) + .max_by_key(|(_, p)| p.data.cost)? + .0 as u32; + + let mut steps = vec![]; + let mut at = to; + loop { + steps.push(at); + at = data.page(at).data.prev; + if at == u32::MAX { + break; + }; + } + steps.reverse(); + if steps.first() == Some(&from) { + Some(steps) + } else { + None + } +} + +pub fn run(datafile: &Path, from: &str) -> io::Result<()> { + println!(">> Import"); + let mut databuf = BufReader::new(File::open(datafile)?); + let data = store::read_adjacency_list(&mut databuf)?; + let pages = data.pages.clone(); + + println!(">> Locate from and to"); + let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from)); + println!("From: {:?}", data.page(from_idx).data.title); + + println!(">> Find all shortest paths"); + let data = full_dijkstra(data, from_idx); + + println!(">> Find longest shortest path"); + let path = find_longest_shortest_path(data, from_idx); + + if let Some(path) = path { + println!("Path found:"); + for page_idx in path { + let page = &pages[page_idx as usize]; + if page.data.redirect { + println!(" v {:?}", page.data.title); + } else { + println!(" - {:?}", page.data.title); + } + } + } else { + println!("No path found"); + } + + Ok(()) +} diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index 4f58bb6..82079d2 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -1,87 +1,159 @@ -use std::io; +use std::collections::BinaryHeap; +use std::fs::File; +use std::io::{self, BufReader}; +use std::path::Path; -use crate::{ - algo::Dijkstra, - data::Data, - graph::NodeIdx, - util::{self, TitleNormalizer}, -}; +use crate::data::adjacency_list::AdjacencyList; +use crate::data::info::{LinkInfo, PageInfo}; +use crate::data::store; +use crate::util; -/// Find the shortest path between two articles. -#[derive(Debug, clap::Parser)] -pub struct Cmd { - start: String, - goal: String, - - // Search for a path in both directions. - #[arg(long, short)] - bidi: bool, +struct DijkstraPageInfo { + cost: u32, + prev: u32, + redirect: bool, } -fn search_path(data: &Data, start: NodeIdx, goal: NodeIdx) -> Option<(u32, Vec)> { - println!("> Preparing dijkstra"); - let mut dijkstra = Dijkstra::new(&data.graph); - println!("> Running dijkstra"); - dijkstra.run( - start, - |node| node == goal, - |source, _edge, _target| !data.pages[source.usize()].redirect as u32, - ); - - if dijkstra.cost(goal) == u32::MAX { - return None; - } - - println!("> Collecting path"); - let cost = dijkstra.cost(goal); - let path = dijkstra.path(goal); - Some((cost, path)) -} - -fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec)>) { - let start = &data.pages[start.usize()].title; - let goal = &data.pages[goal.usize()].title; - - let Some((cost, path)) = path else { - println!("No path found from {start} to {goal}"); - return; - }; - - println!("Path found (cost {cost}, length {}):", path.len()); - - for page in path { - println!("{}", util::fmt_page(&data.pages[page.usize()])); +impl DijkstraPageInfo { + fn from_page_info(info: PageInfo) -> Self { + Self { + cost: u32::MAX, + prev: u32::MAX, + redirect: info.redirect, + } } } -impl Cmd { - pub fn run(self, data: Data) -> io::Result<()> { - let normalizer = TitleNormalizer::new(); +struct DijkstraLinkInfo { + cost: u32, +} - println!(">> Resolve articles"); - let start = util::resolve_title(&normalizer, &data, &self.start); - let goal = util::resolve_title(&normalizer, &data, &self.goal); - println!("Start: {}", data.pages[start.usize()].title); - println!("Goal: {}", data.pages[goal.usize()].title); +impl DijkstraLinkInfo { + fn from_link_info(info: LinkInfo) -> Self { + Self { + cost: 1, + // cost: 1000 + info.start, + // cost: 10000 + info.start, + // cost: 1000 + info.start / 10, + } + } +} - if self.bidi { - println!(">> Find path forward"); - let forward = search_path(&data, start, goal); - println!(">> Find path backward"); - let backward = search_path(&data, goal, start); +#[derive(Clone, Copy, PartialEq, Eq)] +struct Entry { + cost: u32, + page_idx: u32, +} - println!(); - print_path(&data, start, goal, forward); - println!(); - print_path(&data, goal, start, backward); - } else { - println!(">> Find path"); - let path = search_path(&data, start, goal); +impl Entry { + pub fn new(cost: u32, page_idx: u32) -> Self { + Self { cost, page_idx } + } +} - println!(); - print_path(&data, start, goal, path); +// Manual implementation so the queue is a min-heap instead of a max-heap. +impl Ord for Entry { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + other + .cost + .cmp(&self.cost) + .then_with(|| self.page_idx.cmp(&other.page_idx)) + } +} + +impl PartialOrd for Entry { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +/// Closely matches the dijkstra example in [std::collections::binary_heap]. +fn dijkstra(data: AdjacencyList, from: u32, to: u32) -> Option> { + println!("> Prepare state"); + let mut data = data + .change_page_data(DijkstraPageInfo::from_page_info) + .change_link_data(DijkstraLinkInfo::from_link_info); + let mut queue = BinaryHeap::new(); + data.page_mut(from).data.cost = 0; + queue.push(Entry::new(0, from)); + + println!("> Run dijkstra"); + while let Some(Entry { cost, page_idx }) = queue.pop() { + if page_idx == to { + // We've found the shortest path to our target + break; } - Ok(()) + let page = data.page(page_idx); + if cost > page.data.cost { + // This queue entry is outdated + continue; + } + + let redirect = page.data.redirect; + for link_idx in data.link_range(page_idx) { + let link = data.link(link_idx); + + let next = Entry { + cost: cost + if redirect { 0 } else { link.data.cost }, + page_idx: link.to, + }; + + let target_page = data.page_mut(link.to); + if next.cost < target_page.data.cost { + target_page.data.cost = next.cost; + target_page.data.prev = page_idx; + queue.push(next); + } + } + } + + println!("> Collect results"); + let mut steps = vec![]; + let mut at = to; + loop { + steps.push(at); + at = data.page(at).data.prev; + if at == u32::MAX { + break; + }; + } + steps.reverse(); + if steps.first() == Some(&from) { + Some(steps) + } else { + None } } + +pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { + println!(">> Import"); + let mut databuf = BufReader::new(File::open(datafile)?); + let data = store::read_adjacency_list(&mut databuf)?; + let pages = data.pages.clone(); + + println!(">> Locate from and to"); + let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from)); + let to_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, to)); + println!("From: {:?}", data.page(from_idx).data.title); + println!("To: {:?}", data.page(to_idx).data.title); + + println!(">> Find path"); + let path = dijkstra(data, from_idx, to_idx); + + if let Some(path) = path { + println!("Path found:"); + for page_idx in path { + let page = &pages[page_idx as usize]; + if page.data.redirect { + println!(" v {:?}", page.data.title); + } else { + println!(" - {:?}", page.data.title); + } + } + } else { + println!("No path found"); + } + + Ok(()) +} diff --git a/brood/src/commands/pg.rs b/brood/src/commands/pg.rs deleted file mode 100644 index a106a3a..0000000 --- a/brood/src/commands/pg.rs +++ /dev/null @@ -1,273 +0,0 @@ -use std::{ - collections::{BTreeSet, HashMap, HashSet}, - io::{self, BufWriter}, -}; - -use crate::{ - data::Data, - graph::NodeIdx, - util::{self, TitleNormalizer}, -}; - -struct PageMap(Vec); - -impl PageMap { - fn new(len: usize) -> Self { - Self(vec![NodeIdx::NONE; len]) - } - - fn get(&self, node: NodeIdx) -> NodeIdx { - self.0[node.usize()] - } - - fn set(&mut self, node: NodeIdx, to: NodeIdx) { - self.0[node.usize()] = to; - } -} - -fn first_viable_link(data: &Data, node: NodeIdx) -> Option { - for edge in data.graph.edge_slice(node) { - let link = &data.links[edge.usize()]; - if !link.in_parens() && !link.in_structure() { - return Some(*edge); - } - } - None -} - -fn find_forward_edges(data: &Data) -> PageMap { - let mut result = PageMap::new(data.pages.len()); - for node in data.graph.nodes() { - if let Some(first_link) = first_viable_link(data, node) { - result.set(node, first_link); - } - } - result -} - -fn find_clusters(data: &Data, forward: &PageMap) -> PageMap { - let mut cluster = PageMap::new(data.pages.len()); - for node in data.graph.nodes() { - let mut current = node; - let mut visited = HashSet::new(); - let canonical = loop { - // We've already determined the canonical element for this page. - if cluster.get(current) != NodeIdx::NONE { - break cluster.get(current); - } - - // We've hit a loop - if visited.contains(¤t) { - let mut loop_members = BTreeSet::new(); - while !loop_members.contains(¤t) { - loop_members.insert(current); - current = forward.get(current); - } - break loop_members.pop_first().unwrap(); - } - - visited.insert(current); - - let next = forward.get(current); - if next == NodeIdx::NONE { - // We've hit a dead-end - break current; - } - - current = next; - }; - - for i in visited { - cluster.set(i, canonical); - } - } - - cluster -} - -enum Cluster { - DeadEnd(NodeIdx), - Loop(Vec), -} - -fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap { - let mut result = HashMap::new(); - for canonical in cluster.0.iter().copied().collect::>() { - if forward.get(canonical) == NodeIdx::NONE { - result.insert(canonical, Cluster::DeadEnd(canonical)); - continue; - } - - let mut members = vec![]; - let mut current = canonical; - loop { - members.push(current); - current = forward.get(current); - if current == canonical { - break; - } - } - result.insert(canonical, Cluster::Loop(members)); - } - - result -} - -fn print_forward_edges_as_json(data: &Data, forward: &PageMap) -> io::Result<()> { - let map = forward - .0 - .iter() - .enumerate() - .map(|(node, first_link)| { - let page_title = &data.pages[node].title; - let first_link_title = if *first_link == NodeIdx::NONE { - None - } else { - Some(&data.pages[first_link.usize()].title) - }; - (page_title, first_link_title) - }) - .collect::>(); - - let writer = BufWriter::new(io::stdout()); - serde_json::to_writer_pretty(writer, &map)?; - Ok(()) -} - -fn print_trace(normalizer: &TitleNormalizer, data: &Data, forward: &PageMap, start: &str) { - let start_idx = util::resolve_title(normalizer, data, start); - - let mut current = start_idx; - let mut visited = HashSet::new(); - loop { - let page = &data.pages[current.usize()]; - let title = &page.title; - if page.redirect { - println!(" v {title}"); - } else { - println!(" - {title}"); - } - - visited.insert(current); - - let next = forward.get(current); - - if next == NodeIdx::NONE { - println!("> dead-end reached"); - return; - } - - if visited.contains(&next) { - let page = &data.pages[next.usize()]; - let title = &page.title; - println!("> loop detected ({title})"); - return; - } - - current = next; - } -} - -fn print_canonical_pages_as_json(data: &Data, cluster: &PageMap) -> io::Result<()> { - let map = cluster - .0 - .iter() - .enumerate() - .map(|(page, canonical)| { - ( - &data.pages[page].title, - &data.pages[canonical.usize()].title, - ) - }) - .collect::>(); - - let writer = BufWriter::new(io::stdout()); - serde_json::to_writer_pretty(writer, &map)?; - Ok(()) -} - -#[derive(Debug, PartialEq, Eq, clap::Parser)] -enum Command { - First, - Trace { start: String }, - Canonical, - Cluster, -} - -/// Show interesting stats. -#[derive(Debug, clap::Parser)] -pub struct Cmd { - #[command(subcommand)] - command: Command, -} - -impl Cmd { - pub fn run(self, data: Data) -> io::Result<()> { - let normalizer = TitleNormalizer::new(); - - eprintln!(">> Forward"); - let forward = find_forward_edges(&data); - - match self.command { - Command::First => { - eprintln!(">> First links"); - print_forward_edges_as_json(&data, &forward)?; - return Ok(()); - } - Command::Trace { start } => { - eprintln!(">> Tracing"); - print_trace(&normalizer, &data, &forward, &start); - return Ok(()); - } - _ => {} - } - - // Determine cluster for each page, represented via canonical page. The - // canonical page of a cluster is either a dead-end or the loop member with - // the smallest index. - eprintln!(">> Find clusters"); - let cluster = find_clusters(&data, &forward); - - if self.command == Command::Canonical { - print_canonical_pages_as_json(&data, &cluster)?; - return Ok(()); - } - - // Measure cluster size - eprintln!(">> Measure clusters"); - let mut cluster_size = HashMap::::new(); - for (i, canonical) in cluster.0.iter().enumerate() { - assert!(*canonical != NodeIdx::NONE, "{}", data.pages[i].title); - *cluster_size.entry(*canonical).or_default() += 1; - } - let mut cluster_by_size = cluster_size.into_iter().collect::>(); - cluster_by_size.sort_by_key(|(c, s)| (*s, *c)); - cluster_by_size.reverse(); - - // Print clusters - assert!(self.command == Command::Cluster); - let resolved = resolve_clusters(&forward, &cluster); - for (canonical, size) in cluster_by_size { - match resolved.get(&canonical).unwrap() { - Cluster::DeadEnd(page) => { - let title = &data.pages[page.usize()].title; - println!("Cluster (dead-end, {size}): {title}"); - } - Cluster::Loop(pages) => { - println!("Cluster ({}-loop, {size}):", pages.len()); - for page in pages { - let page = &data.pages[page.usize()]; - let title = &page.title; - if page.redirect { - println!(" v {title}"); - } else { - println!(" - {title}"); - } - } - } - } - } - - Ok(()) - } -} diff --git a/brood/src/commands/philosophy_game.rs b/brood/src/commands/philosophy_game.rs new file mode 100644 index 0000000..178df1d --- /dev/null +++ b/brood/src/commands/philosophy_game.rs @@ -0,0 +1,267 @@ +use std::{ + collections::{BTreeSet, HashMap, HashSet}, + fs::File, + io::{self, BufReader, BufWriter}, + path::Path, +}; + +use crate::{ + data::{ + adjacency_list::AdjacencyList, + info::{LinkInfo, PageInfo}, + store, + }, + util, PhilosophyGameCmd, +}; + +struct PageMap(Vec); + +impl PageMap { + fn new(len: usize) -> Self { + Self(vec![u32::MAX; len]) + } + + fn get(&self, page_idx: u32) -> u32 { + self.0[page_idx as usize] + } + + fn set(&mut self, page_idx: u32, to: u32) { + self.0[page_idx as usize] = to; + } +} + +fn first_viable_link(data: &AdjacencyList, page_idx: u32) -> Option { + for link_idx in data.link_range(page_idx) { + let link = data.link(link_idx); + if !link.data.in_parens() && !link.data.in_structure() { + return Some(link.to); + } + } + None +} + +fn find_forward_edges(data: &AdjacencyList) -> PageMap { + let mut result = PageMap::new(data.pages.len()); + for (page_idx, _) in data.pages() { + if let Some(first_link) = first_viable_link(data, page_idx) { + result.set(page_idx, first_link); + } + } + result +} + +fn find_clusters(data: &AdjacencyList, forward: &PageMap) -> PageMap { + let mut cluster = PageMap::new(data.pages.len()); + for (page_idx, _) in data.pages() { + let mut current = page_idx; + let mut visited = HashSet::new(); + let canonical = loop { + // We've already determined the canonical element for this page. + if cluster.get(current) != u32::MAX { + break cluster.get(current); + } + + // We've hit a loop + if visited.contains(¤t) { + let mut loop_members = BTreeSet::new(); + while !loop_members.contains(¤t) { + loop_members.insert(current); + current = forward.get(current); + } + break loop_members.pop_first().unwrap(); + } + + visited.insert(current); + + let next = forward.get(current); + if next == u32::MAX { + // We've hit a dead-end + break current; + } + + current = next; + }; + + for i in visited { + cluster.set(i, canonical); + } + } + + cluster +} + +enum Cluster { + DeadEnd(u32), + Loop(Vec), +} + +fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap { + let mut result = HashMap::new(); + for canonical in cluster.0.iter().copied().collect::>() { + if forward.get(canonical) == u32::MAX { + result.insert(canonical, Cluster::DeadEnd(canonical)); + continue; + } + + let mut members = vec![]; + let mut current = canonical; + loop { + members.push(current); + current = forward.get(current); + if current == canonical { + break; + } + } + result.insert(canonical, Cluster::Loop(members)); + } + + result +} + +fn print_forward_edges_as_json( + data: &AdjacencyList, + forward: &PageMap, +) -> io::Result<()> { + let map = forward + .0 + .iter() + .enumerate() + .map(|(page, first_link)| { + let page_title = &data.page(page as u32).data.title; + let first_link_title = if *first_link == u32::MAX { + None + } else { + Some(&data.page(*first_link).data.title) + }; + (page_title, first_link_title) + }) + .collect::>(); + + let writer = BufWriter::new(io::stdout()); + serde_json::to_writer_pretty(writer, &map)?; + Ok(()) +} + +fn print_trace(data: &AdjacencyList, forward: &PageMap, start: &str) { + let start_idx = util::resolve_redirects(data, util::find_index_of_title(&data.pages, start)); + + let mut current = start_idx; + let mut visited = HashSet::new(); + loop { + let page = data.page(current); + let title = &page.data.title; + if page.data.redirect { + println!(" v {title}"); + } else { + println!(" - {title}"); + } + + visited.insert(current); + + let next = forward.get(current); + + if next == u32::MAX { + println!("dead-end reached"); + return; + } + + if visited.contains(&next) { + println!("loop detected"); + return; + } + + current = next; + } +} + +fn print_canonical_pages_as_json( + data: &AdjacencyList, + cluster: &PageMap, +) -> io::Result<()> { + let map = cluster + .0 + .iter() + .enumerate() + .map(|(page, canonical)| { + ( + &data.page(page as u32).data.title, + &data.page(*canonical).data.title, + ) + }) + .collect::>(); + + let writer = BufWriter::new(io::stdout()); + serde_json::to_writer_pretty(writer, &map)?; + Ok(()) +} + +pub fn run(datafile: &Path, subcmd: PhilosophyGameCmd) -> io::Result<()> { + eprintln!(">> Import"); + let mut databuf = BufReader::new(File::open(datafile)?); + let data = store::read_adjacency_list(&mut databuf)?; + + eprintln!(">> Forward"); + let forward = find_forward_edges(&data); + + match subcmd { + PhilosophyGameCmd::First => { + eprintln!(">> First links"); + print_forward_edges_as_json(&data, &forward)?; + return Ok(()); + } + PhilosophyGameCmd::Trace { start } => { + eprintln!(">> Tracing"); + print_trace(&data, &forward, &start); + return Ok(()); + } + _ => {} + } + + // Determine cluster for each page, represented via canonical page. The + // canonical page of a cluster is either a dead-end or the loop member with + // the smallest index. + eprintln!(">> Find clusters"); + let cluster = find_clusters(&data, &forward); + + if subcmd == PhilosophyGameCmd::Canonical { + print_canonical_pages_as_json(&data, &cluster)?; + return Ok(()); + } + + // Measure cluster size + eprintln!(">> Measure clusters"); + let mut cluster_size = HashMap::::new(); + for (i, canonical) in cluster.0.iter().enumerate() { + assert!(*canonical != u32::MAX, "{}", data.page(i as u32).data.title); + *cluster_size.entry(*canonical).or_default() += 1; + } + let mut cluster_by_size = cluster_size.into_iter().collect::>(); + cluster_by_size.sort_by_key(|(c, s)| (*s, *c)); + cluster_by_size.reverse(); + + // Print clusters + assert!(subcmd == PhilosophyGameCmd::Cluster); + let resolved = resolve_clusters(&forward, &cluster); + for (canonical, size) in cluster_by_size { + match resolved.get(&canonical).unwrap() { + Cluster::DeadEnd(page) => { + let title = &data.page(*page).data.title; + println!("Cluster (dead-end, {size}): {title}"); + } + Cluster::Loop(pages) => { + println!("Cluster ({}-loop, {size}):", pages.len()); + for page in pages { + let page = data.page(*page); + let title = &page.data.title; + if page.data.redirect { + println!(" v {title}"); + } else { + println!(" - {title}"); + } + } + } + } + } + + Ok(()) +} diff --git a/brood/src/commands/reexport.rs b/brood/src/commands/reexport.rs new file mode 100644 index 0000000..4c17d82 --- /dev/null +++ b/brood/src/commands/reexport.rs @@ -0,0 +1,104 @@ +use std::collections::{HashMap, HashSet}; +use std::fs::{self, File}; +use std::io::{self, BufReader, BufWriter}; +use std::path::{Path, PathBuf}; + +use serde::Deserialize; + +use crate::data::adjacency_list::AdjacencyList; +use crate::data::info::{LinkInfo, PageInfo}; +use crate::data::store; +use crate::util; + +#[derive(Deserialize)] +struct FilterFile { + title: String, + language: String, +} + +fn filter_pages( + data: &AdjacencyList, + keep: HashSet, +) -> AdjacencyList { + // Map from old to new indices. Only contains entries for pages to keep. + let mut index_map = HashMap::new(); + for (page_idx, page) in data.pages() { + if keep.contains(&util::normalize_link(&page.data.title)) { + index_map.insert(page_idx, index_map.len() as u32); + } + } + + // Create new adjacency list in a single pass + let mut result = AdjacencyList::default(); + for (page_idx, page) in data.pages() { + let Some(new_idx) = index_map.get(&page_idx) else { + continue; + }; + + let actual_new_idx = result.push_page(page.data.clone()); + assert!(*new_idx == actual_new_idx); + + for (_, link) in data.links(page_idx) { + if let Some(to) = index_map.get(&link.to) { + result.push_link(*to, link.data); + } + } + } + + result +} + +pub fn reexport( + from: &Path, + to: &Path, + in_parens: Option, + in_structure: Option, + filter: Option, +) -> io::Result<()> { + eprintln!(">> Import"); + let mut from = BufReader::new(File::open(from)?); + let mut data = store::read_adjacency_list(&mut from)?; + + eprintln!(">> Consistency check"); + data.check_consistency(); + + if in_parens.is_some() || in_structure.is_some() || filter.is_some() { + eprintln!(">> Filtering"); + + let mut data2 = AdjacencyList::default(); + for (page_idx, page) in data.pages() { + data2.push_page(page.data.clone()); + for (_, link) in data.links(page_idx) { + if in_parens.is_some_and(|v| v != link.data.in_parens()) { + continue; + } + + if in_structure.is_some_and(|v| v != link.data.in_structure()) { + continue; + } + + data2.push_link(link.to, link.data); + } + } + + data = data2; + + if let Some(filter) = filter { + let filter = fs::read_to_string(filter)?; + let filter = serde_json::from_str::>(&filter).unwrap(); + let keep = filter + .into_iter() + .filter(|f| f.language == "en") + .map(|f| f.title) + .map(|t| util::normalize_link(&t)) + .collect::>(); + data = filter_pages(&data, keep); + } + } + + eprintln!(">> Export"); + let mut to = BufWriter::new(File::create(to)?); + store::write_adjacency_list(&data, &mut to)?; + + Ok(()) +} diff --git a/brood/src/commands/show.rs b/brood/src/commands/show.rs deleted file mode 100644 index 0c67388..0000000 --- a/brood/src/commands/show.rs +++ /dev/null @@ -1,151 +0,0 @@ -use std::{collections::HashSet, io}; - -use thousands::Separable; - -use crate::{ - data::Data, - util::{self, TitleNormalizer}, -}; - -/// Show info about a specific article. -#[derive(Debug, clap::Parser)] -pub struct Cmd { - title: String, - - /// Print links in more detail. - #[arg(long, short)] - links: bool, -} - -impl Cmd { - pub fn run(self, data: Data) -> io::Result<()> { - let normalizer = TitleNormalizer::new(); - - println!(">> Locate article"); - let mut node = util::locate_title(&normalizer, &data, &self.title); - - loop { - let page = &data.pages[node.usize()]; - - const W_LABEL: usize = 12; - const W_NUM: usize = 11; - - println!(); - - println!("{:>W_LABEL$}: {}", "Title", page.title); - - println!( - "{:>W_LABEL$}: {}", - "Title (norm)", - normalizer.normalize(&page.title) - ); - - println!("{:>W_LABEL$}: {}", "Redirect", page.redirect); - - println!("{:>W_LABEL$}: {:>W_NUM$}", "ID", page.id); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "Length", - page.length.separate_with_underscores() - ); - - let outlinks = data.graph.edge_slice(node).to_vec(); - let inlinks = data - .graph - .edges() - .filter(|(_, target)| *target == node) - .map(|(source, _)| source) - .collect::>(); - - let outlinks_set = outlinks.iter().copied().collect::>(); - let inlinks_set = inlinks.iter().copied().collect::>(); - let twins_set = outlinks_set - .intersection(&inlinks_set) - .copied() - .collect::>(); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "Links (out)", - outlinks.len().separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "unique", - outlinks_set.len().separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "Links (in)", - inlinks.len().separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "unique", - inlinks_set.len().separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "Twins", - twins_set.len().separate_with_underscores() - ); - - if self.links { - let mut twin_pages = twins_set - .iter() - .map(|n| &data.pages[n.usize()]) - .collect::>(); - - let mut outlink_only_pages = outlinks_set - .difference(&twins_set) - .map(|n| &data.pages[n.usize()]) - .collect::>(); - - let mut inlink_only_pages = inlinks_set - .difference(&twins_set) - .map(|n| &data.pages[n.usize()]) - .collect::>(); - - twin_pages.sort_by_key(|p| &p.title); - outlink_only_pages.sort_by_key(|p| &p.title); - inlink_only_pages.sort_by_key(|p| &p.title); - - println!(); - println!("Twins ({}):", twin_pages.len().separate_with_underscores()); - for page in twin_pages { - println!("{}", util::fmt_page(page)); - } - - println!(); - println!( - "Only outlinks ({}):", - outlink_only_pages.len().separate_with_underscores() - ); - for page in outlink_only_pages { - println!("{}", util::fmt_page(page)); - } - - println!(); - println!( - "Only inlinks ({}):", - inlink_only_pages.len().separate_with_underscores() - ); - for page in inlink_only_pages { - println!("{}", util::fmt_page(page)); - } - } - - node = match data.redirect_target(node) { - Some(target) => target, - None => break, - }; - } - - Ok(()) - } -} diff --git a/brood/src/commands/stats.rs b/brood/src/commands/stats.rs deleted file mode 100644 index 760cec6..0000000 --- a/brood/src/commands/stats.rs +++ /dev/null @@ -1,98 +0,0 @@ -mod degrees; -mod redirects; - -use std::io; - -use thousands::Separable; - -use crate::data::Data; - -#[derive(Debug, clap::Parser)] -enum Command { - Degrees(degrees::Cmd), - Redirects(redirects::Cmd), -} - -/// Show interesting stats. -#[derive(Debug, clap::Parser)] -pub struct Cmd { - #[command(subcommand)] - command: Option, -} - -impl Cmd { - pub fn run(self, data: Data) -> io::Result<()> { - if let Some(cmd) = self.command { - return match cmd { - Command::Degrees(cmd) => cmd.run(data), - Command::Redirects(cmd) => cmd.run(data), - }; - } - - println!(); - - const W_LABEL: usize = 14; - const W_NUM: usize = 11; - - let n_pages = data.pages.len(); - let n_redirects = data.pages.iter().filter(|p| p.redirect).count(); - let n_articles = n_pages - n_redirects; - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "Pages", - n_pages.separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "Articles", - n_articles.separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "Redirects", - n_redirects.separate_with_underscores() - ); - - println!(); - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "Links", - data.links.len().separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "in parens", - data.links - .iter() - .filter(|l| l.in_parens()) - .count() - .separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "in structures", - data.links - .iter() - .filter(|l| l.in_structure()) - .count() - .separate_with_underscores() - ); - - println!( - "{:>W_LABEL$}: {:>W_NUM$}", - "pg eligible", - data.links - .iter() - .filter(|l| !l.in_parens() && !l.in_structure()) - .count() - .separate_with_underscores() - ); - - Ok(()) - } -} diff --git a/brood/src/commands/stats/degrees.rs b/brood/src/commands/stats/degrees.rs deleted file mode 100644 index 5bd05be..0000000 --- a/brood/src/commands/stats/degrees.rs +++ /dev/null @@ -1,92 +0,0 @@ -use std::{cmp::Reverse, io}; - -use thousands::Separable; - -use crate::{ - algo, - data::{Data, Page}, - util, -}; - -/// Show stats on article in- and out-degrees. -#[derive(Debug, clap::Parser)] -pub struct Cmd { - #[arg(long, short, default_value_t = 5)] - top: usize, -} - -impl Cmd { - pub fn run(self, mut data: Data) -> io::Result<()> { - println!(">> Outdegree"); - println!("> Counting links"); - let mut outdegree = vec![usize::MAX; data.pages.len()]; - for node in data.graph.nodes() { - outdegree[node.usize()] = data.graph.edge_range(node).len(); - } - - println!(">> Indegree"); - println!("> Inverting edges"); - algo::invert(&mut data); - let mut indegree = vec![usize::MAX; data.pages.len()]; - println!("> Counting links"); - for node in data.graph.nodes() { - indegree[node.usize()] = data.graph.edge_range(node).len(); - } - - let mut by_degrees = data - .pages - .iter() - .zip(outdegree) - .zip(indegree) - .map(|((p, od), id)| (p, od, id)) - .collect::>(); - - println!(); - println!("Most outlinks"); - println!("¯¯¯¯¯¯¯¯¯¯¯¯¯"); - - by_degrees.sort_by_key(|(_, od, _)| Reverse(*od)); - self.print_links(&by_degrees); - - println!(); - println!("Most inlinks"); - println!("¯¯¯¯¯¯¯¯¯¯¯¯"); - - by_degrees.sort_by_key(|(_, _, id)| Reverse(*id)); - self.print_links(&by_degrees); - - by_degrees.retain(|(_, od, id)| *od > 0 && *id > 0); - - println!(); - println!("Most outlinks per non-zero inlink"); - println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯"); - - by_degrees.sort_by(|(_, od1, id1), (_, od2, id2)| { - let r1 = *od1 as f32 / *id1 as f32; - let r2 = *od2 as f32 / *id2 as f32; - r2.total_cmp(&r1) // Reverse order so max values are at beginnibg - }); - self.print_links(&by_degrees); - - println!(); - println!("Most inlinks per non-zero outlink"); - println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯"); - - by_degrees.reverse(); - self.print_links(&by_degrees); - - Ok(()) - } - - fn print_links(&self, by_degrees: &Vec<(&Page, usize, usize)>) { - for (i, (page, od, id)) in by_degrees.iter().take(self.top).enumerate() { - println!( - "{:3}. {} ({} out, {} in)", - i + 1, - util::fmt_page(page), - od.separate_with_underscores(), - id.separate_with_underscores() - ); - } - } -} diff --git a/brood/src/commands/stats/redirects.rs b/brood/src/commands/stats/redirects.rs deleted file mode 100644 index 6bf2204..0000000 --- a/brood/src/commands/stats/redirects.rs +++ /dev/null @@ -1,107 +0,0 @@ -use std::{cmp::Reverse, collections::HashSet, io}; - -use thousands::Separable; - -use crate::{data::Data, graph::NodeIdx, util}; - -fn find_redirects(data: &Data) -> Vec<(NodeIdx, NodeIdx, usize)> { - let mut redirects = Vec::<(NodeIdx, NodeIdx, usize)>::new(); - - for node in data.graph.nodes() { - if !data.pages[node.usize()].redirect { - continue; - } - - let mut seen = HashSet::new(); - - let mut curr = node; - seen.insert(node); - - while let Some(next) = data.redirect_target(curr) { - if seen.contains(&next) { - println!(" Redirect loop: {}", data.pages[node.usize()].title); - break; - } - - curr = next; - seen.insert(next); - } - - redirects.push((node, curr, seen.len() - 1)); - } - - redirects -} - -fn follow_redirect(data: &Data, start: NodeIdx) -> Vec { - let mut seen = HashSet::new(); - let mut nodes = Vec::new(); - - let mut curr = start; - seen.insert(curr); - nodes.push(curr); - - while let Some(next) = data.redirect_target(curr) { - if seen.contains(&next) { - break; - } - - curr = next; - seen.insert(curr); - nodes.push(curr); - } - - nodes -} - -/// Show redirect stats. -#[derive(Debug, clap::Parser)] -pub struct Cmd { - /// Show more detailed info. - #[arg(long, short)] - long: bool, -} - -impl Cmd { - pub fn run(self, data: Data) -> io::Result<()> { - println!(">> Resolve redirects"); - let redirects = find_redirects(&data); - - println!( - "There is a total of {} redirects.", - redirects.len().separate_with_underscores() - ); - - let mut long = redirects - .iter() - .filter(|(_, _, l)| *l > 1) - .collect::>(); - long.sort_by_key(|(_, _, l)| Reverse(l)); - - println!( - "{} redirects take more than one step to reach an article.", - long.len().separate_with_underscores() - ); - - println!( - "The longest redirect chain takes {} steps.", - long.iter().map(|(_, _, l)| l).max().copied().unwrap_or(0), - ); - - println!("Though these redirect chains are usually swiftly fixed by bots."); - - if self.long { - println!(); - println!("Redirect chains with length > 1:"); - - for (start, _, _) in long { - println!(); - for step in follow_redirect(&data, *start) { - println!("{}", util::fmt_page(&data.pages[step.usize()])); - } - } - } - - Ok(()) - } -} diff --git a/brood/src/data.rs b/brood/src/data.rs index c253094..16aa0eb 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -1,218 +1,3 @@ -use std::{ - fs::File, - io::{self, BufReader, BufWriter, Read, Write}, - path::Path, -}; - -use crate::graph::{EdgeIdx, Graph, NodeIdx}; - -#[derive(Debug, Clone)] -pub struct Page { - pub id: u32, - pub title: String, - pub length: u32, - pub redirect: bool, -} - -#[derive(Debug, Default, Clone, Copy)] -pub struct Link { - pub start: u32, - pub len: u32, - pub flags: u8, -} - -impl Link { - pub fn in_parens(self) -> bool { - self.flags & 0b1 != 0 - } - - pub fn in_structure(self) -> bool { - self.flags & 0b10 != 0 - } -} - -fn write_u8(w: &mut impl Write, n: u8) -> io::Result<()> { - w.write_all(&n.to_le_bytes()) -} - -fn read_u8(r: &mut impl Read) -> io::Result { - let mut buf = [0_u8; 1]; - r.read_exact(&mut buf)?; - Ok(u8::from_le_bytes(buf)) -} - -fn write_u16(w: &mut impl Write, n: u16) -> io::Result<()> { - w.write_all(&n.to_le_bytes()) -} - -fn read_u16(r: &mut impl Read) -> io::Result { - let mut buf = [0_u8; 2]; - r.read_exact(&mut buf)?; - Ok(u16::from_le_bytes(buf)) -} - -fn write_u32(w: &mut impl Write, n: u32) -> io::Result<()> { - w.write_all(&n.to_le_bytes()) -} - -fn read_u32(r: &mut impl Read) -> io::Result { - let mut buf = [0_u8; 4]; - r.read_exact(&mut buf)?; - Ok(u32::from_le_bytes(buf)) -} - -fn write_str(w: &mut impl Write, s: &str) -> io::Result<()> { - assert!(s.len() <= u16::MAX as usize); - write_u16(w, s.len() as u16)?; - w.write_all(s.as_bytes())?; - Ok(()) -} - -fn read_str(r: &mut impl Read) -> io::Result { - let len = read_u16(r)? as usize; - let mut buf = vec![0_u8; len]; - r.read_exact(&mut buf)?; - Ok(String::from_utf8(buf).unwrap()) -} - -fn write_page(w: &mut impl Write, page: &Page) -> io::Result<()> { - write_u32(w, page.id)?; - write_u32(w, page.length)?; - write_u8(w, if page.redirect { 1 } else { 0 })?; - write_str(w, &page.title)?; - Ok(()) -} - -pub fn read_page(r: &mut impl Read) -> io::Result { - Ok(Page { - id: read_u32(r)?, - length: read_u32(r)?, - redirect: read_u8(r)? != 0, - title: read_str(r)?, - }) -} - -fn write_link(w: &mut impl Write, link: &Link) -> io::Result<()> { - write_u32(w, link.start)?; - write_u32(w, link.len)?; - write_u8(w, link.flags)?; - Ok(()) -} - -fn read_link(r: &mut impl Read) -> io::Result { - Ok(Link { - start: read_u32(r)?, - len: read_u32(r)?, - flags: read_u8(r)?, - }) -} - -#[derive(Default)] -pub struct Data { - pub pages: Vec, - pub links: Vec, - pub graph: Graph, -} - -impl Data { - pub fn new() -> Self { - Self::default() - } - - pub fn with_capacity(pages: usize, links: usize) -> Self { - Self { - pages: Vec::with_capacity(pages), - links: Vec::with_capacity(links), - graph: Graph::with_capacity(pages, links), - } - } - - fn write(&self, w: &mut impl Write) -> io::Result<()> { - assert!(self.pages.len() < u32::MAX as usize); - assert!(self.links.len() < u32::MAX as usize); - assert_eq!(self.pages.len(), self.graph.nodes.len()); - assert_eq!(self.links.len(), self.graph.edges.len()); - write_u32(w, self.pages.len() as u32)?; - write_u32(w, self.links.len() as u32)?; - - for page in &self.pages { - write_page(w, page)?; - } - - for link in &self.links { - write_link(w, link)?; - } - - for node in &self.graph.nodes { - write_u32(w, node.0)?; - } - - for edge in &self.graph.edges { - write_u32(w, edge.0)?; - } - - Ok(()) - } - - fn read(r: &mut impl Read) -> io::Result { - let n_pages = read_u32(r)?; - let n_links = read_u32(r)?; - - let mut result = Self::with_capacity(n_pages as usize, n_links as usize); - - for _ in 0..n_pages { - result.pages.push(read_page(r)?); - } - - for _ in 0..n_links { - result.links.push(read_link(r)?); - } - - for _ in 0..n_pages { - result.graph.nodes.push(EdgeIdx(read_u32(r)?)); - } - - for _ in 0..n_links { - result.graph.edges.push(NodeIdx(read_u32(r)?)); - } - - assert_eq!(result.pages.len(), result.graph.nodes.len()); - assert_eq!(result.links.len(), result.graph.edges.len()); - result.graph.check_consistency(); - Ok(result) - } - - pub fn write_to_file(&self, path: &Path) -> io::Result<()> { - let mut file = BufWriter::new(File::create(path)?); - self.write(&mut file) - } - - pub fn read_from_file(path: &Path) -> io::Result { - let mut file = BufReader::new(File::open(path)?); - Self::read(&mut file) - } - - pub fn check_consistency(&self) { - assert_eq!( - self.pages.len(), - self.graph.nodes.len(), - "inconsistent number of pages" - ); - - assert_eq!( - self.links.len(), - self.graph.edges.len(), - "inconsistent number of links" - ); - - self.graph.check_consistency(); - } - - pub fn redirect_target(&self, node: NodeIdx) -> Option { - if !self.pages[node.usize()].redirect { - return None; - } - - self.graph.edge_slice(node).first().copied() - } -} +pub mod adjacency_list; +pub mod info; +pub mod store; diff --git a/brood/src/data/adjacency_list.rs b/brood/src/data/adjacency_list.rs new file mode 100644 index 0000000..4c07b3d --- /dev/null +++ b/brood/src/data/adjacency_list.rs @@ -0,0 +1,162 @@ +use std::ops::Range; + +use super::info::{LinkInfo, PageInfo}; + +#[derive(Debug, Clone, Copy)] +pub struct Page

{ + /// Index of the first link belonging to this page. + pub start: u32, + pub data: P, +} + +impl

Page

{ + pub fn change_data(self, f: impl Fn(P) -> P2) -> Page { + Page { + start: self.start, + data: f(self.data), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct Link { + /// Index of the page this link points to. + pub to: u32, + pub data: L, +} + +impl Link { + pub fn change_data(self, f: impl Fn(L) -> L2) -> Link { + Link { + to: self.to, + data: f(self.data), + } + } +} + +pub struct AdjacencyList { + pub pages: Vec>, + pub links: Vec>, +} + +impl Default for AdjacencyList { + fn default() -> Self { + Self { + pages: Default::default(), + links: Default::default(), + } + } +} + +impl AdjacencyList { + pub fn push_page(&mut self, data: P) -> u32 { + self.pages.push(Page { + start: self.links.len() as u32, + data, + }); + self.pages.len() as u32 - 1 + } + + pub fn push_link(&mut self, to: u32, data: L) -> u32 { + self.links.push(Link { to, data }); + self.links.len() as u32 - 1 + } + + pub fn page(&self, page_idx: u32) -> &Page

{ + &self.pages[page_idx as usize] + } + + pub fn page_mut(&mut self, page_idx: u32) -> &mut Page

{ + &mut self.pages[page_idx as usize] + } + + pub fn pages(&self) -> impl Iterator)> { + self.pages.iter().enumerate().map(|(i, p)| (i as u32, p)) + } + + pub fn link(&self, link_idx: u32) -> &Link { + &self.links[link_idx as usize] + } + + pub fn link_mut(&mut self, link_idx: u32) -> &mut Link { + &mut self.links[link_idx as usize] + } + + pub fn link_range(&self, page_idx: u32) -> Range { + let start_idx = self.pages[page_idx as usize].start; + let end_idx = match self.pages.get(page_idx as usize + 1) { + Some(page) => page.start, + None => self.links.len() as u32, + }; + start_idx..end_idx + } + + pub fn link_redirect(&self, page_idx: u32) -> Option { + let range = self.link_range(page_idx); + if range.is_empty() { + None + } else { + Some(range.start) + } + } + + pub fn links(&self, page_idx: u32) -> impl Iterator)> { + self.link_range(page_idx).map(|i| (i, self.link(i))) + } + + pub fn change_page_data(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList { + let pages = self + .pages + .into_iter() + .map(|p| p.change_data(page_f)) + .collect::>(); + + AdjacencyList { + pages, + links: self.links, + } + } + + pub fn change_link_data(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList { + let links = self + .links + .into_iter() + .map(|l| l.change_data(link_f)) + .collect::>(); + + AdjacencyList { + pages: self.pages, + links, + } + } +} + +impl AdjacencyList { + pub fn check_consistency(&self) { + // Check that all types are large enough + assert!(self.pages.len() < u32::MAX as usize, "too many pages"); + assert!(self.links.len() < u32::MAX as usize, "too many links"); + for page in &self.pages { + assert!( + page.data.title.len() <= u8::MAX as usize, + "page title too long" + ); + } + + // Check that all links contain valid indices. Links must not link to + // the sentinel page. + let range = 0..self.pages.len() as u32; + for link in &self.links { + assert!(range.contains(&link.to), "invalid link"); + } + + // Check that all redirect pages have at most one link + for (page_idx, page) in self.pages.iter().enumerate() { + if page.data.redirect { + let range = self.link_range(page_idx as u32); + let amount = range.end - range.start; + assert!(amount <= 1, "too many redirect links"); + } + } + } +} diff --git a/brood/src/data/info.rs b/brood/src/data/info.rs new file mode 100644 index 0000000..dad04d4 --- /dev/null +++ b/brood/src/data/info.rs @@ -0,0 +1,24 @@ +#[derive(Debug, Clone)] +pub struct PageInfo { + pub id: u32, + pub title: String, + pub length: u32, + pub redirect: bool, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct LinkInfo { + pub start: u32, + pub len: u32, + pub flags: u8, +} + +impl LinkInfo { + pub fn in_parens(self) -> bool { + self.flags & 0b1 != 0 + } + + pub fn in_structure(self) -> bool { + self.flags & 0b10 != 0 + } +} diff --git a/brood/src/data/store.rs b/brood/src/data/store.rs new file mode 100644 index 0000000..afba1a3 --- /dev/null +++ b/brood/src/data/store.rs @@ -0,0 +1,134 @@ +use std::io::{self, Read, Write}; + +use super::{ + adjacency_list::{AdjacencyList, Link, Page}, + info::{LinkInfo, PageInfo}, +}; + +fn write_u8(n: u8, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) +} + +fn read_u8(from: &mut R) -> io::Result { + let mut buf = [0_u8; 1]; + from.read_exact(&mut buf)?; + Ok(u8::from_le_bytes(buf)) +} + +fn write_u16(n: u16, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) +} + +fn read_u16(from: &mut R) -> io::Result { + let mut buf = [0_u8; 2]; + from.read_exact(&mut buf)?; + Ok(u16::from_le_bytes(buf)) +} + +fn write_u32(n: u32, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) +} + +fn read_u32(from: &mut R) -> io::Result { + let mut buf = [0_u8; 4]; + from.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) +} + +fn write_str(s: &str, to: &mut W) -> io::Result<()> { + assert!(s.len() <= u16::MAX as usize); + write_u16(s.len() as u16, to)?; + to.write_all(s.as_bytes())?; + Ok(()) +} + +fn read_str(from: &mut R) -> io::Result { + let len = read_u16(from)? as usize; + let mut buf = vec![0_u8; len]; + from.read_exact(&mut buf)?; + Ok(String::from_utf8(buf).unwrap()) +} + +fn write_page(page: &Page, to: &mut W) -> io::Result<()> { + write_u32(page.start, to)?; + write_u32(page.data.id, to)?; + write_u32(page.data.length, to)?; + write_u8(if page.data.redirect { 1 } else { 0 }, to)?; + write_str(&page.data.title, to)?; + + Ok(()) +} + +pub fn read_page(from: &mut R) -> io::Result> { + let start_link_idx = read_u32(from)?; + let id = read_u32(from)?; + let length = read_u32(from)?; + let redirect = read_u8(from)? != 0; + let title = read_str(from)?; + + Ok(Page { + start: start_link_idx, + data: PageInfo { + id, + length, + redirect, + title, + }, + }) +} + +fn write_link(link: &Link, to: &mut W) -> io::Result<()> { + write_u32(link.to, to)?; + write_u32(link.data.start, to)?; + write_u32(link.data.len, to)?; + write_u8(link.data.flags, to)?; + + Ok(()) +} + +fn read_link(from: &mut R) -> io::Result> { + let to_page_idx = read_u32(from)?; + let start = read_u32(from)?; + let len = read_u32(from)?; + let flags = read_u8(from)?; + + Ok(Link { + to: to_page_idx, + data: LinkInfo { start, len, flags }, + }) +} + +pub fn write_adjacency_list( + al: &AdjacencyList, + to: &mut W, +) -> io::Result<()> { + write_u32(al.pages.len() as u32, to)?; + write_u32(al.links.len() as u32, to)?; + + for page in &al.pages { + write_page(page, to)?; + } + + for link in &al.links { + write_link(link, to)?; + } + + Ok(()) +} + +pub fn read_adjacency_list(from: &mut R) -> io::Result> { + let n_pages = read_u32(from)?; + let n_links = read_u32(from)?; + + let mut pages = vec![]; + for _ in 0..n_pages { + pages.push(read_page(from)?); + } + + let mut links = vec![]; + for _ in 0..n_links { + links.push(read_link(from)?); + } + + Ok(AdjacencyList { pages, links }) +} diff --git a/brood/src/graph.rs b/brood/src/graph.rs deleted file mode 100644 index a869300..0000000 --- a/brood/src/graph.rs +++ /dev/null @@ -1,295 +0,0 @@ -use std::ops::{Add, AddAssign, Range, Sub, SubAssign}; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct NodeIdx(pub u32); - -impl NodeIdx { - pub const NONE: Self = Self(u32::MAX); - - #[inline] - pub const fn new(value: usize) -> Self { - Self(value as u32) - } - - #[inline] - pub const fn usize(self) -> usize { - self.0 as usize - } -} - -impl From for NodeIdx { - fn from(value: u32) -> Self { - Self(value) - } -} - -impl From for NodeIdx { - fn from(value: usize) -> Self { - Self::new(value) - } -} - -impl Add for NodeIdx { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - Self(self.0 + rhs.0) - } -} - -impl AddAssign for NodeIdx { - fn add_assign(&mut self, rhs: Self) { - self.0 += rhs.0; - } -} - -impl Sub for NodeIdx { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - Self(self.0 - rhs.0) - } -} - -impl SubAssign for NodeIdx { - fn sub_assign(&mut self, rhs: Self) { - self.0 -= rhs.0; - } -} - -impl Add for NodeIdx { - type Output = Self; - - fn add(self, rhs: u32) -> Self::Output { - Self(self.0 + rhs) - } -} - -impl AddAssign for NodeIdx { - fn add_assign(&mut self, rhs: u32) { - self.0 += rhs; - } -} - -impl Sub for NodeIdx { - type Output = Self; - - fn sub(self, rhs: u32) -> Self::Output { - Self(self.0 - rhs) - } -} - -impl SubAssign for NodeIdx { - fn sub_assign(&mut self, rhs: u32) { - self.0 -= rhs; - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct EdgeIdx(pub u32); - -impl EdgeIdx { - #[inline] - pub const fn new(value: usize) -> Self { - Self(value as u32) - } - - #[inline] - pub const fn usize(self) -> usize { - self.0 as usize - } -} - -impl From for EdgeIdx { - fn from(value: u32) -> Self { - Self(value) - } -} - -impl From for EdgeIdx { - fn from(value: usize) -> Self { - Self::new(value) - } -} - -impl Add for EdgeIdx { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - Self(self.0 + rhs.0) - } -} - -impl AddAssign for EdgeIdx { - fn add_assign(&mut self, rhs: Self) { - self.0 += rhs.0; - } -} - -impl Sub for EdgeIdx { - type Output = Self; - - fn sub(self, rhs: Self) -> Self::Output { - Self(self.0 - rhs.0) - } -} - -impl SubAssign for EdgeIdx { - fn sub_assign(&mut self, rhs: Self) { - self.0 -= rhs.0; - } -} - -impl Add for EdgeIdx { - type Output = Self; - - fn add(self, rhs: u32) -> Self::Output { - Self(self.0 + rhs) - } -} - -impl AddAssign for EdgeIdx { - fn add_assign(&mut self, rhs: u32) { - self.0 += rhs; - } -} - -impl Sub for EdgeIdx { - type Output = Self; - - fn sub(self, rhs: u32) -> Self::Output { - Self(self.0 - rhs) - } -} - -impl SubAssign for EdgeIdx { - fn sub_assign(&mut self, rhs: u32) { - self.0 -= rhs; - } -} - -#[derive(Default)] -pub struct Graph { - /// A node points to the first of its edges. - /// - /// A special case is that if the subsequent node points to the same edge, - /// the current node has no edges. - pub nodes: Vec, - - /// An edge points to a target node. - /// - /// The source node is defined implicitly by the graph data structure. - pub edges: Vec, -} - -impl Graph { - pub fn with_capacity(nodes: usize, edges: usize) -> Self { - Self { - nodes: Vec::with_capacity(nodes), - edges: Vec::with_capacity(edges), - } - } - - pub fn add_node(&mut self) { - self.nodes.push(EdgeIdx::new(self.edges.len())); - } - - pub fn add_edge(&mut self, target: NodeIdx) { - self.edges.push(target); - } - - pub fn check_consistency(&self) { - if self.nodes.is_empty() { - assert!(self.edges.is_empty(), "edges must belong to existing nodes"); - return; - } - - assert!(self.nodes.len() < u32::MAX as usize, "too many nodes"); - assert!(self.edges.len() < u32::MAX as usize, "too many edges"); - - assert_eq!( - *self.nodes.first().unwrap(), - EdgeIdx(0), - "first node pointer must be 0" - ); - - for (ni, node) in self.nodes.iter().cloned().enumerate() { - assert!( - node.usize() <= self.edges.len(), - "node pointers must be in range" - ); - - if let Some(succ) = self.nodes.get(ni + 1) { - assert!(node <= *succ, "node pointers must be well-ordered"); - } - } - - for edge in &self.edges { - assert!( - edge.usize() < self.nodes.len(), - "edge pointers must be in range" - ); - } - } - - pub fn nodes(&self) -> impl Iterator + '_ { - (0..self.nodes.len()).map(NodeIdx::new) - } - - pub fn edges(&self) -> impl Iterator + '_ { - Edges::new(self) - } - - pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx { - self.nodes - .get(node.usize()) - .copied() - .unwrap_or_else(|| self.edges.len().into()) - } - - pub fn edge_range(&self, node: NodeIdx) -> Range { - let start = self.nodes[node.usize()]; - let end = self.edge_start(node + 1); - start.usize()..end.usize() - } - - pub fn edge_slice(&self, node: NodeIdx) -> &[NodeIdx] { - &self.edges[self.edge_range(node)] - } -} - -struct Edges<'a> { - graph: &'a Graph, - ni: NodeIdx, - ei: EdgeIdx, -} - -impl<'a> Edges<'a> { - fn new(graph: &'a Graph) -> Self { - Self { - graph, - ni: NodeIdx(0), - ei: EdgeIdx(0), - } - } -} - -impl Iterator for Edges<'_> { - type Item = (NodeIdx, NodeIdx); - - fn next(&mut self) -> Option { - if self.ei.usize() >= self.graph.edges.len() { - return None; - } - let target = self.graph.edges[self.ei.usize()]; - - // if would not be sufficient because some nodes may not have any edges. - while self.ei >= self.graph.edge_start(self.ni + 1) { - self.ni += 1; - } - let source = self.ni; - - self.ei += 1; - Some((source, target)) - } -} diff --git a/brood/src/main.rs b/brood/src/main.rs index 270aee8..d31076c 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -1,23 +1,51 @@ -mod algo; -mod commands; +pub mod commands; mod data; -mod graph; mod util; -use std::{io, path::PathBuf}; +use std::io; +use std::path::PathBuf; use clap::Parser; -use data::Data; + +#[derive(Debug, PartialEq, Eq, Parser)] +pub enum PhilosophyGameCmd { + First, + Canonical, + Cluster, + Trace { start: String }, +} #[derive(Debug, Parser)] enum Command { - Ingest(commands::ingest::Cmd), - Export(commands::export::Cmd), - Show(commands::show::Cmd), - Stats(commands::stats::Cmd), - Path(commands::path::Cmd), - LongestPath(commands::longest_path::Cmd), - Pg(commands::pg::Cmd), + /// Read sift data on stdin and output brood data. + Ingest, + /// Read and reexport brood data. + Reexport { + to: PathBuf, + #[arg(long, short = 'P')] + in_parens: Option, + #[arg(long, short = 'S')] + in_structure: Option, + #[arg(long, short = 'F')] + filter: Option, + }, + /// Find a path from one article to another. + Path { + from: String, + to: String, + /// Flip start and end article. + #[arg(short, long)] + flip: bool, + }, + /// Find the longest shortest path starting at an article. + LongestShortestPath { from: String }, + /// Analyze articles using "Philosophy Game" rules. + PhilosophyGame { + #[command(subcommand)] + subcmd: PhilosophyGameCmd, + }, + /// Print all page titles. + ListPages, } #[derive(Debug, Parser)] @@ -25,59 +53,31 @@ struct Args { datafile: PathBuf, #[command(subcommand)] command: Command, - #[arg(long, short = 'P')] - in_parens: Option, - #[arg(long, short = 'S')] - in_structure: Option, - #[arg(long, short = 'R')] - resolve_redirects: bool, - #[arg(long, short = 'I')] - invert_edges: bool, - #[arg(long, short)] - check_consistency: bool, } fn main() -> io::Result<()> { let args = Args::parse(); - - if let Command::Ingest(cmd) = &args.command { - return cmd.run(&args.datafile); - } - - println!(">> Import"); - println!("> Reading data"); - let mut data = Data::read_from_file(&args.datafile)?; - - if args.in_parens.is_some() || args.in_structure.is_some() { - println!("> Filtering edges"); - algo::retain_edges(&mut data, |link| { - args.in_parens.is_none_or(|b| b == link.in_parens()) - && args.in_structure.is_none_or(|b| b == link.in_structure()) - }); - } - - if args.resolve_redirects { - println!("> Resolving redirects"); - algo::resolve_redirects(&mut data); - } - - if args.invert_edges { - println!("> Inverting edges"); - algo::invert(&mut data); - } - - if args.check_consistency { - println!("> Checking consistencey"); - data.check_consistency(); - } - match args.command { - Command::Ingest(_) => unreachable!(), - Command::Export(cmd) => cmd.run(data), - Command::Show(cmd) => cmd.run(data), - Command::Stats(cmd) => cmd.run(data), - Command::Path(cmd) => cmd.run(data), - Command::LongestPath(cmd) => cmd.run(data), - Command::Pg(cmd) => cmd.run(data), + Command::Ingest => commands::ingest::ingest(&args.datafile), + Command::Reexport { + to, + in_parens, + in_structure, + filter, + } => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure, filter), + Command::Path { from, to, flip } => { + if flip { + commands::path::path(&args.datafile, &to, &from) + } else { + commands::path::path(&args.datafile, &from, &to) + } + } + Command::LongestShortestPath { from } => { + commands::longest_shortest_path::run(&args.datafile, &from) + } + Command::PhilosophyGame { subcmd } => { + commands::philosophy_game::run(&args.datafile, subcmd) + } + Command::ListPages => commands::list_pages::run(&args.datafile), } } diff --git a/brood/src/util.rs b/brood/src/util.rs index cc6ee42..e1a64ff 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,160 +1,39 @@ -use std::{collections::HashSet, fmt}; - -use regex::Regex; - -use crate::{ - data::{Data, Page}, - graph::NodeIdx, +use crate::data::{ + adjacency_list::{AdjacencyList, Page}, + info::{LinkInfo, PageInfo}, }; -// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/mediawiki.Title.phpCharToUpper.js -struct PhpCharToUpper(char); +pub fn normalize_link(link: &str) -> String { + let link = link.trim().replace(' ', "_"); -impl fmt::Display for PhpCharToUpper { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self.0 { - // Do something special, I guess - 'ᾀ' => write!(f, "ᾈ"), - 'ᾁ' => write!(f, "ᾉ"), - 'ᾂ' => write!(f, "ᾊ"), - 'ᾃ' => write!(f, "ᾋ"), - 'ᾄ' => write!(f, "ᾌ"), - 'ᾅ' => write!(f, "ᾍ"), - 'ᾆ' => write!(f, "ᾎ"), - 'ᾇ' => write!(f, "ᾏ"), - 'ᾐ' => write!(f, "ᾘ"), - 'ᾑ' => write!(f, "ᾙ"), - 'ᾒ' => write!(f, "ᾚ"), - 'ᾓ' => write!(f, "ᾛ"), - 'ᾔ' => write!(f, "ᾜ"), - 'ᾕ' => write!(f, "ᾝ"), - 'ᾖ' => write!(f, "ᾞ"), - 'ᾗ' => write!(f, "ᾟ"), - 'ᾠ' => write!(f, "ᾨ"), - 'ᾡ' => write!(f, "ᾩ"), - 'ᾢ' => write!(f, "ᾪ"), - 'ᾣ' => write!(f, "ᾫ"), - 'ᾤ' => write!(f, "ᾬ"), - 'ᾥ' => write!(f, "ᾭ"), - 'ᾦ' => write!(f, "ᾮ"), - 'ᾧ' => write!(f, "ᾯ"), - 'ᾳ' => write!(f, "ᾼ"), - 'ῃ' => write!(f, "ῌ"), - 'ῳ' => write!(f, "ῼ"), - - // Do not capitalize - 'ß' | 'ʼn' | 'ǰ' | 'ʂ' | 'ͅ' | 'ΐ' | 'ΰ' | 'և' | 'ა' | 'ბ' | 'გ' | 'დ' | 'ე' | 'ვ' - | 'ზ' | 'თ' | 'ი' | 'კ' | 'ლ' | 'მ' | 'ნ' | 'ო' | 'პ' | 'ჟ' | 'რ' | 'ს' | 'ტ' | 'უ' - | 'ფ' | 'ქ' | 'ღ' | 'ყ' | 'შ' | 'ჩ' | 'ც' | 'ძ' | 'წ' | 'ჭ' | 'ხ' | 'ჯ' | 'ჰ' | 'ჱ' - | 'ჲ' | 'ჳ' | 'ჴ' | 'ჵ' | 'ჶ' | 'ჷ' | 'ჸ' | 'ჹ' | 'ჺ' | 'ჽ' | 'ჾ' | 'ჿ' | 'ᶎ' | 'ẖ' - | 'ẗ' | 'ẘ' | 'ẙ' | 'ẚ' | 'ὐ' | 'ὒ' | 'ὔ' | 'ὖ' | 'ᾈ' | 'ᾉ' | 'ᾊ' | 'ᾋ' | 'ᾌ' | 'ᾍ' - | 'ᾎ' | 'ᾏ' | 'ᾘ' | 'ᾙ' | 'ᾚ' | 'ᾛ' | 'ᾜ' | 'ᾝ' | 'ᾞ' | 'ᾟ' | 'ᾨ' | 'ᾩ' | 'ᾪ' | 'ᾫ' - | 'ᾬ' | 'ᾭ' | 'ᾮ' | 'ᾯ' | 'ᾲ' | 'ᾴ' | 'ᾶ' | 'ᾷ' | 'ᾼ' | 'ῂ' | 'ῄ' | 'ῆ' | 'ῇ' | 'ῌ' - | 'ῒ' | 'ΐ' | 'ῖ' | 'ῗ' | 'ῢ' | 'ΰ' | 'ῤ' | 'ῦ' | 'ῧ' | 'ῲ' | 'ῴ' | 'ῶ' | 'ῷ' | 'ῼ' - | 'ⅰ' | 'ⅱ' | 'ⅲ' | 'ⅳ' | 'ⅴ' | 'ⅵ' | 'ⅶ' | 'ⅷ' | 'ⅸ' | 'ⅹ' | 'ⅺ' | 'ⅻ' | 'ⅼ' | 'ⅽ' - | 'ⅾ' | 'ⅿ' | 'ⓐ' | 'ⓑ' | 'ⓒ' | 'ⓓ' | 'ⓔ' | 'ⓕ' | 'ⓖ' | 'ⓗ' | 'ⓘ' | 'ⓙ' | 'ⓚ' | 'ⓛ' - | 'ⓜ' | 'ⓝ' | 'ⓞ' | 'ⓟ' | 'ⓠ' | 'ⓡ' | 'ⓢ' | 'ⓣ' | 'ⓤ' | 'ⓥ' | 'ⓦ' | 'ⓧ' | 'ⓨ' | 'ⓩ' - | 'ꞔ' | 'ꞹ' | 'ꞻ' | 'ꞽ' | 'ꞿ' | 'ꟃ' | 'ff' | 'fi' | 'fl' | 'ffi' | 'ffl' | 'ſt' | 'st' | 'ﬓ' - | 'ﬔ' | 'ﬕ' | 'ﬖ' | 'ﬗ' | '𖹠' | '𖹡' | '𖹢' | '𖹣' | '𖹤' | '𖹥' | '𖹦' | '𖹧' | '𖹨' | '𖹩' - | '𖹪' | '𖹫' | '𖹬' | '𖹭' | '𖹮' | '𖹯' | '𖹰' | '𖹱' | '𖹲' | '𖹳' | '𖹴' | '𖹵' | '𖹶' | '𖹷' - | '𖹸' | '𖹹' | '𖹺' | '𖹻' | '𖹼' | '𖹽' | '𖹾' | '𖹿' => { - write!(f, "{}", self.0) - } - - // Capitalize normally - c => write!(f, "{}", c.to_uppercase()), - } - } + // Make only first char lowercase + link.chars() + .next() + .iter() + .flat_map(|c| c.to_lowercase()) + .chain(link.chars().skip(1)) + .collect::() } -pub struct TitleNormalizer { - strip_bidi: Regex, - clean_up_whitespace: Regex, - trim_underscore_start: Regex, - trim_underscore_end: Regex, -} - -impl TitleNormalizer { - pub fn new() -> Self { - Self { - strip_bidi: Regex::new("[\u{200E}\u{200F}\u{202A}-\u{202E}]").unwrap(), - - clean_up_whitespace: Regex::new(concat!( - "[ _\u{00A0}\u{1680}\u{180E}\u{2000}-\u{200A}", - "\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+" - )) - .unwrap(), - - trim_underscore_start: Regex::new("^_+").unwrap(), - - trim_underscore_end: Regex::new("_+$").unwrap(), - } - } - - /// Normalize an article title. - /// - /// See also . - pub fn normalize(&self, title: &str) -> String { - // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L403 - - // Strip Unicode bidi override characters - let title = self.strip_bidi.replace_all(title, ""); - - // Clean up whitespace - let title = self.clean_up_whitespace.replace_all(&title, "_"); - - // Trim _ from beginning and end - let title = self.trim_underscore_start.replace_all(&title, ""); - let title = self.trim_underscore_end.replace_all(&title, ""); - - // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L206 - let Some(first) = title.chars().next() else { - return String::new(); - }; - let rest = &title[first.len_utf8()..]; - format!("{}{rest}", PhpCharToUpper(first)) - } -} - -pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { - let normalized = normalizer.normalize(title); - data.pages +pub fn find_index_of_title(pages: &[Page], title: &str) -> u32 { + let title = normalize_link(title); + pages .iter() .enumerate() - .find(|(_, p)| normalizer.normalize(&p.title) == normalized) - .map(|(i, _)| NodeIdx::new(i)) - .expect("invalid title") + .find(|(_, p)| normalize_link(&p.data.title) == title) + .map(|(i, _)| i) + .expect("invalid title") as u32 } -pub fn resolve_redirects(data: &Data, node: NodeIdx) -> NodeIdx { - let mut curr = node; - let mut seen = HashSet::new(); - - seen.insert(curr); - while let Some(target) = data.redirect_target(curr) { - if seen.contains(&target) { - println!( - " Redirect cycle deteted: {:?}", - data.pages[node.usize()].title - ); - break; +pub fn resolve_redirects(data: &AdjacencyList, mut page_idx: u32) -> u32 { + loop { + if data.page(page_idx).data.redirect { + if let Some(link_idx) = data.link_redirect(page_idx) { + page_idx = data.link(link_idx).to; + continue; + } } - seen.insert(target); - curr = target; - } - - curr -} - -pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx { - resolve_redirects(data, locate_title(normalizer, data, title)) -} - -pub fn fmt_page(page: &Page) -> String { - if page.redirect { - format!("v {}", page.title) - } else { - format!("- {}", page.title) + return page_idx; } } diff --git a/sift/sift.py b/sift/sift.py index 2562fa2..bde2e74 100644 --- a/sift/sift.py +++ b/sift/sift.py @@ -172,21 +172,16 @@ def process_xmldump_page(page): # Page info as simple tuples def simple_pages(input): dump = mwxml.Dump.from_file(sys.stdin) - articles = 0 for i, page in enumerate(dump.pages): - if (i + 1) % 1000 == 0: - # Yeah, the articles are usually off by one - eprint(f"{i+1:8} pages, {articles:8} articles, at pid {page.id:8}") - if page.namespace != 0: continue - articles += 1 + if (i + 1) % 1000 == 0: + eprint(f"{i+1:8} pages, at pid {page.id:8}") + [revision] = list(page) # Every page has exactly one revision yield page.id, page.title, revision.text or "", page.redirect - eprint(f"{articles} articles total") - def process_simple_page(info): pid, title, text, redirect = info