Compare commits
1 commit
master
...
old-filter
| Author | SHA1 | Date | |
|---|---|---|---|
| d85b61d419 |
27 changed files with 1326 additions and 2220 deletions
327
brood/Cargo.lock
generated
327
brood/Cargo.lock
generated
|
|
@ -1,21 +1,12 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.18"
|
||||
version = "0.6.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
|
||||
checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"anstyle-parse",
|
||||
|
|
@ -28,33 +19,33 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "anstyle"
|
||||
version = "1.0.10"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
|
||||
checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-parse"
|
||||
version = "0.2.6"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
|
||||
checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
|
||||
dependencies = [
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-query"
|
||||
version = "1.1.2"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
|
||||
checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
|
||||
dependencies = [
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-wincon"
|
||||
version = "3.0.6"
|
||||
version = "3.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
|
||||
checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"windows-sys",
|
||||
|
|
@ -65,30 +56,16 @@ name = "brood"
|
|||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"indicatif",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thousands",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.5.23"
|
||||
version = "4.5.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
|
||||
checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
|
|
@ -96,9 +73,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "clap_builder"
|
||||
version = "4.5.23"
|
||||
version = "4.5.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
|
||||
checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
|
|
@ -108,9 +85,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.5.18"
|
||||
version = "4.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
||||
checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
|
|
@ -120,34 +97,15 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.7.4"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
|
||||
checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.3"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
|
||||
|
||||
[[package]]
|
||||
name = "console"
|
||||
version = "0.15.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
|
||||
dependencies = [
|
||||
"encode_unicode",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"unicode-width",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encode_unicode"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
||||
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
|
|
@ -155,123 +113,41 @@ version = "0.5.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "indicatif"
|
||||
version = "0.17.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
|
||||
dependencies = [
|
||||
"console",
|
||||
"number_prefix",
|
||||
"portable-atomic",
|
||||
"unicode-width",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.1"
|
||||
version = "1.70.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||
checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.14"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.76"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.169"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "number_prefix"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
||||
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.92"
|
||||
version = "1.0.86"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
|
||||
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.38"
|
||||
version = "1.0.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
|
||||
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.11.1"
|
||||
name = "rustc-hash"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||
checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
|
|
@ -281,18 +157,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
|||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.217"
|
||||
version = "1.0.203"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
|
||||
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.217"
|
||||
version = "1.0.203"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
|
||||
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
|
@ -301,12 +177,11 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.134"
|
||||
version = "1.0.118"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d"
|
||||
checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
|
@ -319,32 +194,20 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.93"
|
||||
version = "2.0.68"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058"
|
||||
checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thousands"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.14"
|
||||
version = "1.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
|
||||
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
|
|
@ -352,84 +215,20 @@ version = "0.2.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
"wasm-bindgen-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
|
||||
|
||||
[[package]]
|
||||
name = "web-time"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.59.0"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
||||
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
|
|
@ -443,48 +242,48 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
version = "0.52.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
|
||||
|
|
|
|||
|
|
@ -4,9 +4,7 @@ version = "0.0.0"
|
|||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
clap = { version = "4.5.23", features = ["derive", "deprecated"] }
|
||||
indicatif = "0.17.9"
|
||||
regex = "1.11.1"
|
||||
serde = { version = "1.0.217", features = ["derive"] }
|
||||
serde_json = "1.0.134"
|
||||
thousands = "0.2.0"
|
||||
clap = { version = "4.5.7", features = ["derive", "deprecated"] }
|
||||
rustc-hash = "2.0.0"
|
||||
serde = { version = "1.0.203", features = ["derive"] }
|
||||
serde_json = "1.0.118"
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
mod dijkstra;
|
||||
mod edit;
|
||||
|
||||
pub use self::{dijkstra::*, edit::*};
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
use std::{cmp::Reverse, collections::BinaryHeap};
|
||||
|
||||
use crate::graph::{EdgeIdx, Graph, NodeIdx};
|
||||
|
||||
pub struct Dijkstra<'a> {
|
||||
graph: &'a Graph,
|
||||
cost: Vec<u32>,
|
||||
pred: Vec<NodeIdx>,
|
||||
}
|
||||
|
||||
impl<'a> Dijkstra<'a> {
|
||||
pub fn new(graph: &'a Graph) -> Self {
|
||||
Self {
|
||||
graph,
|
||||
cost: vec![u32::MAX; graph.nodes.len()],
|
||||
pred: vec![NodeIdx::NONE; graph.nodes.len()],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn run(
|
||||
&mut self,
|
||||
start: NodeIdx,
|
||||
goal: impl Fn(NodeIdx) -> bool,
|
||||
cost: impl Fn(NodeIdx, EdgeIdx, NodeIdx) -> u32,
|
||||
) {
|
||||
self.cost[start.usize()] = 0;
|
||||
let mut queue = BinaryHeap::new();
|
||||
queue.push((Reverse(0), start));
|
||||
|
||||
while let Some((Reverse(curr_cost), curr)) = queue.pop() {
|
||||
if goal(curr) {
|
||||
break; // We've found the shortest path to our target
|
||||
}
|
||||
|
||||
// These seem to never actually occur
|
||||
// if curr_cost > self.cost[curr.usize()] {
|
||||
// continue; // Outdated entry
|
||||
// }
|
||||
|
||||
for edge in self.graph.edge_range(curr).map(EdgeIdx::new) {
|
||||
let next = self.graph.edges[edge.usize()];
|
||||
let next_cost = curr_cost + cost(curr, edge, next);
|
||||
if next_cost < self.cost[next.usize()] {
|
||||
self.cost[next.usize()] = next_cost;
|
||||
self.pred[next.usize()] = curr;
|
||||
queue.push((Reverse(next_cost), next));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn cost(&self, node: NodeIdx) -> u32 {
|
||||
self.cost[node.usize()]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn pred(&self, node: NodeIdx) -> NodeIdx {
|
||||
self.pred[node.usize()]
|
||||
}
|
||||
|
||||
pub fn path(&self, goal: NodeIdx) -> Vec<NodeIdx> {
|
||||
let mut path = vec![];
|
||||
let mut at = goal;
|
||||
|
||||
loop {
|
||||
path.push(at);
|
||||
at = self.pred(at);
|
||||
if at == NodeIdx::NONE {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
path.reverse();
|
||||
path
|
||||
}
|
||||
}
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
use std::mem;
|
||||
|
||||
use crate::{
|
||||
data::{Data, Link},
|
||||
graph::NodeIdx,
|
||||
util,
|
||||
};
|
||||
|
||||
pub fn retain_edges(data: &mut Data, f: impl Fn(&Link) -> bool) {
|
||||
let mut links = mem::take(&mut data.links).into_iter();
|
||||
let graph = mem::take(&mut data.graph);
|
||||
|
||||
for node in graph.nodes() {
|
||||
data.graph.add_node();
|
||||
|
||||
for edge in graph.edge_slice(node) {
|
||||
let link = links.next().unwrap();
|
||||
if f(&link) {
|
||||
data.links.push(link);
|
||||
data.graph.add_edge(*edge);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn resolve_redirects(data: &mut Data) {
|
||||
// Permutation from input node to input node
|
||||
let mut perm_redirect = vec![NodeIdx::NONE; data.pages.len()];
|
||||
for node in data.graph.nodes() {
|
||||
perm_redirect[node.usize()] = util::resolve_redirects(data, node);
|
||||
}
|
||||
|
||||
// Permutation from input node to final node
|
||||
let mut perm_retain = vec![NodeIdx::NONE; data.pages.len()];
|
||||
let mut perm_retain_count = NodeIdx(0);
|
||||
for (i, page) in data.pages.iter().enumerate() {
|
||||
if !page.redirect {
|
||||
perm_retain[i] = perm_retain_count;
|
||||
perm_retain_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let mut pages = mem::take(&mut data.pages).into_iter();
|
||||
let mut links = mem::take(&mut data.links).into_iter();
|
||||
let graph = mem::take(&mut data.graph);
|
||||
|
||||
for node in graph.nodes() {
|
||||
let page = pages.next().unwrap();
|
||||
let new_node = perm_retain[node.usize()];
|
||||
|
||||
if new_node == NodeIdx::NONE {
|
||||
// Skip all edges
|
||||
for _ in graph.edge_slice(node) {
|
||||
links.next().unwrap();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
data.pages.push(page);
|
||||
data.graph.add_node();
|
||||
|
||||
for edge in graph.edge_slice(node) {
|
||||
let link = links.next().unwrap();
|
||||
let new_edge = perm_retain[perm_redirect[edge.usize()].usize()];
|
||||
|
||||
if new_edge == NodeIdx::NONE {
|
||||
continue;
|
||||
}
|
||||
|
||||
data.links.push(link);
|
||||
data.graph.add_edge(new_edge);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn invert(data: &mut Data) {
|
||||
let links = mem::take(&mut data.links);
|
||||
let graph = mem::take(&mut data.graph);
|
||||
|
||||
let mut edges = graph
|
||||
.edges()
|
||||
.zip(links)
|
||||
.map(|((source, target), link)| (source, target, link))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
edges.sort_by_key(|(_, target, _)| *target);
|
||||
|
||||
let mut edges = edges.into_iter().peekable();
|
||||
for node in graph.nodes() {
|
||||
data.graph.add_node();
|
||||
while edges.peek().is_some_and(|(_, target, _)| *target <= node) {
|
||||
let (source, _, link) = edges.next().unwrap();
|
||||
data.graph.add_edge(source);
|
||||
data.links.push(link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
pub mod export;
|
||||
pub mod ingest;
|
||||
pub mod longest_path;
|
||||
pub mod list_pages;
|
||||
pub mod longest_shortest_path;
|
||||
pub mod path;
|
||||
pub mod pg;
|
||||
pub mod show;
|
||||
pub mod stats;
|
||||
pub mod philosophy_game;
|
||||
pub mod reexport;
|
||||
|
|
|
|||
|
|
@ -1,17 +0,0 @@
|
|||
use std::{io, path::PathBuf};
|
||||
|
||||
use crate::data::Data;
|
||||
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
out: PathBuf,
|
||||
}
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(self, data: Data) -> io::Result<()> {
|
||||
println!(">> Export");
|
||||
data.write_to_file(&self.out)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -1,33 +1,16 @@
|
|||
use std::{
|
||||
collections::{hash_map::Entry, HashMap},
|
||||
fs::File,
|
||||
io::{self, BufRead, BufReader, Seek},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead, BufReader, BufWriter};
|
||||
use std::path::Path;
|
||||
use std::u32;
|
||||
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use rustc_hash::FxHashMap;
|
||||
use serde::Deserialize;
|
||||
use thousands::Separable;
|
||||
|
||||
use crate::{
|
||||
data::{Data, Link, Page},
|
||||
graph::NodeIdx,
|
||||
util::TitleNormalizer,
|
||||
};
|
||||
|
||||
const PROGRESS_CHARS: &str = "█▉▊▋▌▍▎▏ ";
|
||||
|
||||
fn seek_to_start(f: &mut BufReader<File>) -> io::Result<u64> {
|
||||
let size = f.seek(io::SeekFrom::End(0))?;
|
||||
f.seek(io::SeekFrom::Start(0))?;
|
||||
Ok(size)
|
||||
}
|
||||
|
||||
fn file_progress_style() -> ProgressStyle {
|
||||
ProgressStyle::with_template("{wide_bar} {bytes}/{total_bytes}")
|
||||
.unwrap()
|
||||
.progress_chars(PROGRESS_CHARS)
|
||||
}
|
||||
use crate::data::adjacency_list::{AdjacencyList, Page};
|
||||
use crate::data::info::{LinkInfo, PageInfo};
|
||||
use crate::data::store;
|
||||
use crate::util;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct JsonPage {
|
||||
|
|
@ -38,161 +21,151 @@ struct JsonPage {
|
|||
redirect: Option<String>,
|
||||
}
|
||||
|
||||
fn read_titles(f: &mut BufReader<File>) -> io::Result<Vec<String>> {
|
||||
let size = seek_to_start(f)?;
|
||||
let bar = ProgressBar::new(size).with_style(file_progress_style());
|
||||
/*
|
||||
Importing is a tad complicated because of multiple criteria:
|
||||
|
||||
let mut titles = vec![];
|
||||
1. The data must be read in a single pass on stdin
|
||||
2. The process should not consume a lot of memory
|
||||
(can't store the decoded json data directly)
|
||||
3. The process should result in a nice and compact adjacency list format
|
||||
|
||||
for line in bar.wrap_read(f).lines() {
|
||||
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
||||
titles.push(page.title);
|
||||
}
|
||||
Because of this, the import is a bit more complex and has two passes.
|
||||
|
||||
Ok(titles)
|
||||
The first pass imports the data into an adjacency-list-like format, but the
|
||||
`Link::to` field points to a title in `Titles` instead of a page.
|
||||
|
||||
The second pass then resolves the links to page indices and throws away all
|
||||
links that don't point to any known page.
|
||||
*/
|
||||
|
||||
#[derive(Default)]
|
||||
struct Titles {
|
||||
/// Normalized titles
|
||||
titles: Vec<String>,
|
||||
/// Map from normalized title to index in [`Self::titles`].
|
||||
map: FxHashMap<String, u32>,
|
||||
}
|
||||
|
||||
/// Returns a map from normalized title to the index in the brood data where the
|
||||
/// article will appear.
|
||||
///
|
||||
/// Titles in the title list are not always unique. When multiple identical
|
||||
/// titles appear, all but one have to be discarded. Originally, I tried to be
|
||||
/// smart and keep the last occurrence (under the assumption that its data would
|
||||
/// be the newest), but this led to index-based bugs. Because of this, I now
|
||||
/// keep the first occurrence.
|
||||
fn compute_title_lookup(
|
||||
normalizer: &TitleNormalizer,
|
||||
titles: &[String],
|
||||
) -> HashMap<String, (u32, u32)> {
|
||||
let mut title_lookup = HashMap::<String, (u32, u32)>::new();
|
||||
|
||||
let bar = ProgressBar::new(titles.len() as u64)
|
||||
.with_style(ProgressStyle::default_bar().progress_chars(PROGRESS_CHARS));
|
||||
|
||||
for (sift_i, title) in bar.wrap_iter(titles.iter()).enumerate() {
|
||||
// The index where this article will appear in the final list, assuming
|
||||
// it is not a duplicate. For ownership reasons, we compute this here
|
||||
// instead of inside the Entry::Vacant branch of the following match.
|
||||
let brood_i = title_lookup.len();
|
||||
|
||||
match title_lookup.entry(normalizer.normalize(title)) {
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert((sift_i as u32, brood_i as u32));
|
||||
}
|
||||
Entry::Occupied(entry) => {
|
||||
let prev_sift_i = entry.get().0;
|
||||
let prev = &titles[prev_sift_i as usize];
|
||||
if prev == title {
|
||||
bar.println(format!(
|
||||
" {title:?} ({prev_sift_i}) occurs again at {sift_i}"
|
||||
));
|
||||
} else {
|
||||
bar.println(format!(
|
||||
" {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) normalize to {:?}",
|
||||
normalizer.normalize(title)
|
||||
));
|
||||
}
|
||||
impl Titles {
|
||||
fn insert(&mut self, title: String) -> u32 {
|
||||
match self.map.entry(title.clone()) {
|
||||
Entry::Occupied(occupied) => *occupied.get(),
|
||||
Entry::Vacant(vacant) => {
|
||||
let idx = self.titles.len() as u32;
|
||||
self.titles.push(title);
|
||||
vacant.insert(idx);
|
||||
idx
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
title_lookup
|
||||
fn get(&self, i: u32) -> &str {
|
||||
&self.titles[i as usize]
|
||||
}
|
||||
}
|
||||
|
||||
fn read_page_data(
|
||||
normalizer: &TitleNormalizer,
|
||||
title_lookup: &HashMap<String, (u32, u32)>,
|
||||
f: &mut BufReader<File>,
|
||||
) -> io::Result<Data> {
|
||||
let size = seek_to_start(f)?;
|
||||
let bar = ProgressBar::new(size).with_style(file_progress_style());
|
||||
fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
|
||||
let mut titles = Titles::default();
|
||||
let mut result = AdjacencyList::default();
|
||||
|
||||
let mut data = Data::new();
|
||||
let stdin = BufReader::new(io::stdin());
|
||||
for (i, line) in stdin.lines().enumerate() {
|
||||
let json_page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
||||
|
||||
for (i, line) in bar.wrap_read(f).lines().enumerate() {
|
||||
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
||||
let normalized = normalizer.normalize(&page.title);
|
||||
|
||||
let (sift_i, _) = title_lookup[&normalized];
|
||||
if i as u32 != sift_i {
|
||||
// Articles may occur multiple times, and this is not the instance
|
||||
// of the article we should keep.
|
||||
bar.println(format!(
|
||||
" Skipping {:?} ({i}) in favor of {sift_i}",
|
||||
page.title
|
||||
));
|
||||
continue;
|
||||
}
|
||||
|
||||
data.graph.add_node();
|
||||
data.pages.push(Page {
|
||||
id: page.id,
|
||||
title: page.title,
|
||||
length: page.length,
|
||||
redirect: page.redirect.is_some(),
|
||||
result.push_page(PageInfo {
|
||||
id: json_page.id,
|
||||
length: json_page.length,
|
||||
redirect: json_page.redirect.is_some(),
|
||||
title: json_page.title,
|
||||
});
|
||||
|
||||
let mut page_links = page.links;
|
||||
if let Some(target) = page.redirect {
|
||||
page_links.clear();
|
||||
let len = target.len() as u32;
|
||||
page_links.push((target, 0, len, 0));
|
||||
if let Some(to) = json_page.redirect {
|
||||
let to = titles.insert(util::normalize_link(&to));
|
||||
result.push_link(to, LinkInfo::default());
|
||||
} else {
|
||||
for (to, start, len, flags) in json_page.links {
|
||||
let to = titles.insert(util::normalize_link(&to));
|
||||
result.push_link(to, LinkInfo { start, len, flags });
|
||||
}
|
||||
}
|
||||
|
||||
for (target, start, len, flags) in page_links {
|
||||
if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
|
||||
data.graph.add_edge(NodeIdx(*brood_i));
|
||||
data.links.push(Link { start, len, flags });
|
||||
if (i + 1) % 100_000 == 0 {
|
||||
eprintln!("{} pages imported", i + 1)
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("Pages: {}", result.pages.len());
|
||||
eprintln!("Links: {}", result.links.len());
|
||||
eprintln!("Titles: {}", titles.titles.len());
|
||||
eprintln!("Title map entries: {}", titles.map.len());
|
||||
|
||||
Ok((result, titles))
|
||||
}
|
||||
|
||||
/// Create map from normalized title to index in pages.
|
||||
fn initialize_pages_map(pages: &[Page<PageInfo>]) -> FxHashMap<String, u32> {
|
||||
let mut result = FxHashMap::default();
|
||||
for (i, p) in pages.iter().enumerate() {
|
||||
match result.entry(util::normalize_link(&p.data.title)) {
|
||||
Entry::Occupied(entry) => {
|
||||
eprintln!(
|
||||
"{:?} already exists at index {} as {:?}",
|
||||
p.data.title,
|
||||
entry.get(),
|
||||
util::normalize_link(&p.data.title)
|
||||
);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(i as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(data)
|
||||
result
|
||||
}
|
||||
|
||||
/// Convert sift data to brood data.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
/// The sift data file to ingest.
|
||||
data: PathBuf,
|
||||
}
|
||||
fn second_stage(
|
||||
first_stage: &AdjacencyList<PageInfo, LinkInfo>,
|
||||
titles: &Titles,
|
||||
) -> AdjacencyList<PageInfo, LinkInfo> {
|
||||
let pages_map = initialize_pages_map(&first_stage.pages);
|
||||
let mut result = AdjacencyList::default();
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(&self, brood_data: &Path) -> io::Result<()> {
|
||||
let normalizer = TitleNormalizer::new();
|
||||
for (page_idx, page) in first_stage.pages() {
|
||||
result.push_page(page.data.clone());
|
||||
|
||||
println!(">> First pass");
|
||||
let mut sift_data = BufReader::new(File::open(&self.data)?);
|
||||
for (_, link) in first_stage.links(page_idx) {
|
||||
let title = util::normalize_link(titles.get(link.to));
|
||||
if let Some(to) = pages_map.get(&title) {
|
||||
// The link points to an existing article, we should keep it
|
||||
result.push_link(*to, link.data);
|
||||
}
|
||||
}
|
||||
|
||||
println!("> Reading titles");
|
||||
let titles = read_titles(&mut sift_data)?;
|
||||
|
||||
println!("> Computing title index lookup table");
|
||||
let title_lookup = compute_title_lookup(&normalizer, &titles);
|
||||
drop(titles); // Don't hoard memory
|
||||
|
||||
println!(">> Second pass");
|
||||
|
||||
println!("> Reading page data");
|
||||
let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
|
||||
assert_eq!(data.pages.len(), title_lookup.len());
|
||||
drop(title_lookup); // Don't hoard memory
|
||||
drop(sift_data); // No longer needed
|
||||
|
||||
println!("> Checking consistency");
|
||||
data.check_consistency();
|
||||
|
||||
println!(">> Export");
|
||||
println!(
|
||||
"Pages: {:>13}",
|
||||
data.pages.len().separate_with_underscores()
|
||||
);
|
||||
println!(
|
||||
"Links: {:>13}",
|
||||
data.links.len().separate_with_underscores()
|
||||
);
|
||||
data.write_to_file(brood_data)?;
|
||||
|
||||
Ok(())
|
||||
if (page_idx + 1) % 100_000 == 0 {
|
||||
eprintln!("{} pages imported", page_idx + 1)
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("Pages: {}", result.pages.len());
|
||||
eprintln!("Links: {}", result.links.len());
|
||||
eprintln!("Page map entries: {}", pages_map.len());
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub fn ingest(datafile: &Path) -> io::Result<()> {
|
||||
eprintln!(">> First stage");
|
||||
let (first_stage, titles) = first_stage()?;
|
||||
|
||||
eprintln!(">> Second stage");
|
||||
let data = second_stage(&first_stage, &titles);
|
||||
|
||||
eprintln!(">> Consistency check");
|
||||
data.check_consistency();
|
||||
|
||||
eprintln!(">> Export");
|
||||
let mut datafile = BufWriter::new(File::create(datafile)?);
|
||||
store::write_adjacency_list(&data, &mut datafile)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
23
brood/src/commands/list_pages.rs
Normal file
23
brood/src/commands/list_pages.rs
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::data::store;
|
||||
|
||||
pub fn run(datafile: &Path) -> io::Result<()> {
|
||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||
let data = store::read_adjacency_list(&mut databuf)?;
|
||||
|
||||
for (page_idx, page) in data.pages() {
|
||||
if page.data.redirect {
|
||||
for link_idx in data.link_range(page_idx) {
|
||||
let target_page = data.page(data.link(link_idx).to);
|
||||
println!("{:?} -> {:?}", page.data.title, target_page.data.title);
|
||||
}
|
||||
} else {
|
||||
println!("{:?}", page.data.title);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
use std::io;
|
||||
|
||||
use crate::{
|
||||
algo::Dijkstra,
|
||||
data::Data,
|
||||
graph::NodeIdx,
|
||||
util::{self, TitleNormalizer},
|
||||
};
|
||||
|
||||
/// Find the article with the longest shortest path away from the starting
|
||||
/// article.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
start: String,
|
||||
#[arg(long, short, default_value_t = 1)]
|
||||
top: usize,
|
||||
}
|
||||
|
||||
fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec<NodeIdx>)>) {
|
||||
let start = &data.pages[start.usize()].title;
|
||||
let goal = &data.pages[goal.usize()].title;
|
||||
|
||||
let Some((cost, path)) = path else {
|
||||
println!("No path found from {start} to {goal}");
|
||||
return;
|
||||
};
|
||||
|
||||
println!("Path found (cost {cost}, length {}):", path.len());
|
||||
|
||||
for page in path {
|
||||
println!("{}", util::fmt_page(&data.pages[page.usize()]));
|
||||
}
|
||||
}
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(self, data: Data) -> io::Result<()> {
|
||||
let normalizer = TitleNormalizer::new();
|
||||
|
||||
println!(">> Resolve article");
|
||||
let start = util::resolve_title(&normalizer, &data, &self.start);
|
||||
println!("Start: {}", data.pages[start.usize()].title);
|
||||
|
||||
println!(">> Search paths");
|
||||
println!("> Preparing dijkstra");
|
||||
let mut dijkstra = Dijkstra::new(&data.graph);
|
||||
println!("> Running dijkstra");
|
||||
dijkstra.run(
|
||||
start,
|
||||
|_| false,
|
||||
|source, _edge, _target| !data.pages[source.usize()].redirect as u32,
|
||||
);
|
||||
|
||||
println!(">> Find longest paths");
|
||||
let mut costs = data
|
||||
.graph
|
||||
.nodes()
|
||||
.map(|n| (dijkstra.cost(n), n))
|
||||
.filter(|(c, _)| *c < u32::MAX) // Only reachable nodes please
|
||||
.collect::<Vec<_>>();
|
||||
costs.sort_unstable();
|
||||
|
||||
for (cost, goal) in costs.iter().rev().take(self.top) {
|
||||
let path = dijkstra.path(*goal);
|
||||
println!();
|
||||
print_path(&data, start, *goal, Some((*cost, path)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
173
brood/src/commands/longest_shortest_path.rs
Normal file
173
brood/src/commands/longest_shortest_path.rs
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
use std::collections::BinaryHeap;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::data::adjacency_list::AdjacencyList;
|
||||
use crate::data::info::{LinkInfo, PageInfo};
|
||||
use crate::data::store;
|
||||
use crate::util;
|
||||
|
||||
struct DijkstraPageInfo {
|
||||
cost: u32,
|
||||
/// Index of the previous page.
|
||||
prev: u32,
|
||||
redirect: bool,
|
||||
}
|
||||
|
||||
impl DijkstraPageInfo {
|
||||
fn from_page_info(info: PageInfo) -> Self {
|
||||
Self {
|
||||
cost: u32::MAX,
|
||||
prev: u32::MAX,
|
||||
redirect: info.redirect,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct DijkstraLinkInfo {
|
||||
cost: u32,
|
||||
}
|
||||
|
||||
impl DijkstraLinkInfo {
|
||||
fn from_link_info(info: LinkInfo) -> Self {
|
||||
Self {
|
||||
cost: 1,
|
||||
// cost: 1000 + info.start,
|
||||
// cost: 10000 + info.start,
|
||||
// cost: 1000 + info.start / 10,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
struct Entry {
|
||||
cost: u32,
|
||||
page_idx: u32,
|
||||
}
|
||||
|
||||
impl Entry {
|
||||
pub fn new(cost: u32, page_idx: u32) -> Self {
|
||||
Self { cost, page_idx }
|
||||
}
|
||||
}
|
||||
|
||||
// Manual implementation so the queue is a min-heap instead of a max-heap.
|
||||
impl Ord for Entry {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
other
|
||||
.cost
|
||||
.cmp(&self.cost)
|
||||
.then_with(|| self.page_idx.cmp(&other.page_idx))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Entry {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// Closely matches the dijkstra example in [std::collections::binary_heap].
|
||||
fn full_dijkstra(
|
||||
data: AdjacencyList<PageInfo, LinkInfo>,
|
||||
from: u32,
|
||||
) -> AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo> {
|
||||
println!("> Prepare state");
|
||||
let mut data = data
|
||||
.change_page_data(DijkstraPageInfo::from_page_info)
|
||||
.change_link_data(DijkstraLinkInfo::from_link_info);
|
||||
let mut queue = BinaryHeap::new();
|
||||
data.page_mut(from).data.cost = 0;
|
||||
queue.push(Entry::new(0, from));
|
||||
|
||||
println!("> Run dijkstra");
|
||||
while let Some(Entry { cost, page_idx }) = queue.pop() {
|
||||
let page = data.page(page_idx);
|
||||
if cost > page.data.cost {
|
||||
// This queue entry is outdated
|
||||
continue;
|
||||
}
|
||||
|
||||
let redirect = page.data.redirect;
|
||||
for link_idx in data.link_range(page_idx) {
|
||||
let link = data.link(link_idx);
|
||||
|
||||
let next = Entry {
|
||||
cost: cost + if redirect { 0 } else { link.data.cost },
|
||||
page_idx: link.to,
|
||||
};
|
||||
|
||||
let target_page = data.page_mut(link.to);
|
||||
if next.cost < target_page.data.cost {
|
||||
target_page.data.cost = next.cost;
|
||||
target_page.data.prev = page_idx;
|
||||
queue.push(next);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
data
|
||||
}
|
||||
|
||||
fn find_longest_shortest_path(
|
||||
data: AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo>,
|
||||
from: u32,
|
||||
) -> Option<Vec<u32>> {
|
||||
let to = data
|
||||
.pages
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, p)| p.data.cost != u32::MAX)
|
||||
.max_by_key(|(_, p)| p.data.cost)?
|
||||
.0 as u32;
|
||||
|
||||
let mut steps = vec![];
|
||||
let mut at = to;
|
||||
loop {
|
||||
steps.push(at);
|
||||
at = data.page(at).data.prev;
|
||||
if at == u32::MAX {
|
||||
break;
|
||||
};
|
||||
}
|
||||
steps.reverse();
|
||||
if steps.first() == Some(&from) {
|
||||
Some(steps)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn run(datafile: &Path, from: &str) -> io::Result<()> {
|
||||
println!(">> Import");
|
||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||
let data = store::read_adjacency_list(&mut databuf)?;
|
||||
let pages = data.pages.clone();
|
||||
|
||||
println!(">> Locate from and to");
|
||||
let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from));
|
||||
println!("From: {:?}", data.page(from_idx).data.title);
|
||||
|
||||
println!(">> Find all shortest paths");
|
||||
let data = full_dijkstra(data, from_idx);
|
||||
|
||||
println!(">> Find longest shortest path");
|
||||
let path = find_longest_shortest_path(data, from_idx);
|
||||
|
||||
if let Some(path) = path {
|
||||
println!("Path found:");
|
||||
for page_idx in path {
|
||||
let page = &pages[page_idx as usize];
|
||||
if page.data.redirect {
|
||||
println!(" v {:?}", page.data.title);
|
||||
} else {
|
||||
println!(" - {:?}", page.data.title);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("No path found");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -1,87 +1,159 @@
|
|||
use std::io;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::{
|
||||
algo::Dijkstra,
|
||||
data::Data,
|
||||
graph::NodeIdx,
|
||||
util::{self, TitleNormalizer},
|
||||
};
|
||||
use crate::data::adjacency_list::AdjacencyList;
|
||||
use crate::data::info::{LinkInfo, PageInfo};
|
||||
use crate::data::store;
|
||||
use crate::util;
|
||||
|
||||
/// Find the shortest path between two articles.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
start: String,
|
||||
goal: String,
|
||||
|
||||
// Search for a path in both directions.
|
||||
#[arg(long, short)]
|
||||
bidi: bool,
|
||||
struct DijkstraPageInfo {
|
||||
cost: u32,
|
||||
prev: u32,
|
||||
redirect: bool,
|
||||
}
|
||||
|
||||
fn search_path(data: &Data, start: NodeIdx, goal: NodeIdx) -> Option<(u32, Vec<NodeIdx>)> {
|
||||
println!("> Preparing dijkstra");
|
||||
let mut dijkstra = Dijkstra::new(&data.graph);
|
||||
println!("> Running dijkstra");
|
||||
dijkstra.run(
|
||||
start,
|
||||
|node| node == goal,
|
||||
|source, _edge, _target| !data.pages[source.usize()].redirect as u32,
|
||||
);
|
||||
|
||||
if dijkstra.cost(goal) == u32::MAX {
|
||||
return None;
|
||||
}
|
||||
|
||||
println!("> Collecting path");
|
||||
let cost = dijkstra.cost(goal);
|
||||
let path = dijkstra.path(goal);
|
||||
Some((cost, path))
|
||||
}
|
||||
|
||||
fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec<NodeIdx>)>) {
|
||||
let start = &data.pages[start.usize()].title;
|
||||
let goal = &data.pages[goal.usize()].title;
|
||||
|
||||
let Some((cost, path)) = path else {
|
||||
println!("No path found from {start} to {goal}");
|
||||
return;
|
||||
};
|
||||
|
||||
println!("Path found (cost {cost}, length {}):", path.len());
|
||||
|
||||
for page in path {
|
||||
println!("{}", util::fmt_page(&data.pages[page.usize()]));
|
||||
impl DijkstraPageInfo {
|
||||
fn from_page_info(info: PageInfo) -> Self {
|
||||
Self {
|
||||
cost: u32::MAX,
|
||||
prev: u32::MAX,
|
||||
redirect: info.redirect,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(self, data: Data) -> io::Result<()> {
|
||||
let normalizer = TitleNormalizer::new();
|
||||
struct DijkstraLinkInfo {
|
||||
cost: u32,
|
||||
}
|
||||
|
||||
println!(">> Resolve articles");
|
||||
let start = util::resolve_title(&normalizer, &data, &self.start);
|
||||
let goal = util::resolve_title(&normalizer, &data, &self.goal);
|
||||
println!("Start: {}", data.pages[start.usize()].title);
|
||||
println!("Goal: {}", data.pages[goal.usize()].title);
|
||||
impl DijkstraLinkInfo {
|
||||
fn from_link_info(info: LinkInfo) -> Self {
|
||||
Self {
|
||||
cost: 1,
|
||||
// cost: 1000 + info.start,
|
||||
// cost: 10000 + info.start,
|
||||
// cost: 1000 + info.start / 10,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if self.bidi {
|
||||
println!(">> Find path forward");
|
||||
let forward = search_path(&data, start, goal);
|
||||
println!(">> Find path backward");
|
||||
let backward = search_path(&data, goal, start);
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
struct Entry {
|
||||
cost: u32,
|
||||
page_idx: u32,
|
||||
}
|
||||
|
||||
println!();
|
||||
print_path(&data, start, goal, forward);
|
||||
println!();
|
||||
print_path(&data, goal, start, backward);
|
||||
} else {
|
||||
println!(">> Find path");
|
||||
let path = search_path(&data, start, goal);
|
||||
impl Entry {
|
||||
pub fn new(cost: u32, page_idx: u32) -> Self {
|
||||
Self { cost, page_idx }
|
||||
}
|
||||
}
|
||||
|
||||
println!();
|
||||
print_path(&data, start, goal, path);
|
||||
// Manual implementation so the queue is a min-heap instead of a max-heap.
|
||||
impl Ord for Entry {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
other
|
||||
.cost
|
||||
.cmp(&self.cost)
|
||||
.then_with(|| self.page_idx.cmp(&other.page_idx))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Entry {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// Closely matches the dijkstra example in [std::collections::binary_heap].
|
||||
fn dijkstra(data: AdjacencyList<PageInfo, LinkInfo>, from: u32, to: u32) -> Option<Vec<u32>> {
|
||||
println!("> Prepare state");
|
||||
let mut data = data
|
||||
.change_page_data(DijkstraPageInfo::from_page_info)
|
||||
.change_link_data(DijkstraLinkInfo::from_link_info);
|
||||
let mut queue = BinaryHeap::new();
|
||||
data.page_mut(from).data.cost = 0;
|
||||
queue.push(Entry::new(0, from));
|
||||
|
||||
println!("> Run dijkstra");
|
||||
while let Some(Entry { cost, page_idx }) = queue.pop() {
|
||||
if page_idx == to {
|
||||
// We've found the shortest path to our target
|
||||
break;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
let page = data.page(page_idx);
|
||||
if cost > page.data.cost {
|
||||
// This queue entry is outdated
|
||||
continue;
|
||||
}
|
||||
|
||||
let redirect = page.data.redirect;
|
||||
for link_idx in data.link_range(page_idx) {
|
||||
let link = data.link(link_idx);
|
||||
|
||||
let next = Entry {
|
||||
cost: cost + if redirect { 0 } else { link.data.cost },
|
||||
page_idx: link.to,
|
||||
};
|
||||
|
||||
let target_page = data.page_mut(link.to);
|
||||
if next.cost < target_page.data.cost {
|
||||
target_page.data.cost = next.cost;
|
||||
target_page.data.prev = page_idx;
|
||||
queue.push(next);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println!("> Collect results");
|
||||
let mut steps = vec![];
|
||||
let mut at = to;
|
||||
loop {
|
||||
steps.push(at);
|
||||
at = data.page(at).data.prev;
|
||||
if at == u32::MAX {
|
||||
break;
|
||||
};
|
||||
}
|
||||
steps.reverse();
|
||||
if steps.first() == Some(&from) {
|
||||
Some(steps)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
|
||||
println!(">> Import");
|
||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||
let data = store::read_adjacency_list(&mut databuf)?;
|
||||
let pages = data.pages.clone();
|
||||
|
||||
println!(">> Locate from and to");
|
||||
let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from));
|
||||
let to_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, to));
|
||||
println!("From: {:?}", data.page(from_idx).data.title);
|
||||
println!("To: {:?}", data.page(to_idx).data.title);
|
||||
|
||||
println!(">> Find path");
|
||||
let path = dijkstra(data, from_idx, to_idx);
|
||||
|
||||
if let Some(path) = path {
|
||||
println!("Path found:");
|
||||
for page_idx in path {
|
||||
let page = &pages[page_idx as usize];
|
||||
if page.data.redirect {
|
||||
println!(" v {:?}", page.data.title);
|
||||
} else {
|
||||
println!(" - {:?}", page.data.title);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("No path found");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,273 +0,0 @@
|
|||
use std::{
|
||||
collections::{BTreeSet, HashMap, HashSet},
|
||||
io::{self, BufWriter},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
data::Data,
|
||||
graph::NodeIdx,
|
||||
util::{self, TitleNormalizer},
|
||||
};
|
||||
|
||||
struct PageMap(Vec<NodeIdx>);
|
||||
|
||||
impl PageMap {
|
||||
fn new(len: usize) -> Self {
|
||||
Self(vec![NodeIdx::NONE; len])
|
||||
}
|
||||
|
||||
fn get(&self, node: NodeIdx) -> NodeIdx {
|
||||
self.0[node.usize()]
|
||||
}
|
||||
|
||||
fn set(&mut self, node: NodeIdx, to: NodeIdx) {
|
||||
self.0[node.usize()] = to;
|
||||
}
|
||||
}
|
||||
|
||||
fn first_viable_link(data: &Data, node: NodeIdx) -> Option<NodeIdx> {
|
||||
for edge in data.graph.edge_slice(node) {
|
||||
let link = &data.links[edge.usize()];
|
||||
if !link.in_parens() && !link.in_structure() {
|
||||
return Some(*edge);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn find_forward_edges(data: &Data) -> PageMap {
|
||||
let mut result = PageMap::new(data.pages.len());
|
||||
for node in data.graph.nodes() {
|
||||
if let Some(first_link) = first_viable_link(data, node) {
|
||||
result.set(node, first_link);
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn find_clusters(data: &Data, forward: &PageMap) -> PageMap {
|
||||
let mut cluster = PageMap::new(data.pages.len());
|
||||
for node in data.graph.nodes() {
|
||||
let mut current = node;
|
||||
let mut visited = HashSet::new();
|
||||
let canonical = loop {
|
||||
// We've already determined the canonical element for this page.
|
||||
if cluster.get(current) != NodeIdx::NONE {
|
||||
break cluster.get(current);
|
||||
}
|
||||
|
||||
// We've hit a loop
|
||||
if visited.contains(¤t) {
|
||||
let mut loop_members = BTreeSet::new();
|
||||
while !loop_members.contains(¤t) {
|
||||
loop_members.insert(current);
|
||||
current = forward.get(current);
|
||||
}
|
||||
break loop_members.pop_first().unwrap();
|
||||
}
|
||||
|
||||
visited.insert(current);
|
||||
|
||||
let next = forward.get(current);
|
||||
if next == NodeIdx::NONE {
|
||||
// We've hit a dead-end
|
||||
break current;
|
||||
}
|
||||
|
||||
current = next;
|
||||
};
|
||||
|
||||
for i in visited {
|
||||
cluster.set(i, canonical);
|
||||
}
|
||||
}
|
||||
|
||||
cluster
|
||||
}
|
||||
|
||||
enum Cluster {
|
||||
DeadEnd(NodeIdx),
|
||||
Loop(Vec<NodeIdx>),
|
||||
}
|
||||
|
||||
fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap<NodeIdx, Cluster> {
|
||||
let mut result = HashMap::new();
|
||||
for canonical in cluster.0.iter().copied().collect::<HashSet<_>>() {
|
||||
if forward.get(canonical) == NodeIdx::NONE {
|
||||
result.insert(canonical, Cluster::DeadEnd(canonical));
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut members = vec![];
|
||||
let mut current = canonical;
|
||||
loop {
|
||||
members.push(current);
|
||||
current = forward.get(current);
|
||||
if current == canonical {
|
||||
break;
|
||||
}
|
||||
}
|
||||
result.insert(canonical, Cluster::Loop(members));
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn print_forward_edges_as_json(data: &Data, forward: &PageMap) -> io::Result<()> {
|
||||
let map = forward
|
||||
.0
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(node, first_link)| {
|
||||
let page_title = &data.pages[node].title;
|
||||
let first_link_title = if *first_link == NodeIdx::NONE {
|
||||
None
|
||||
} else {
|
||||
Some(&data.pages[first_link.usize()].title)
|
||||
};
|
||||
(page_title, first_link_title)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let writer = BufWriter::new(io::stdout());
|
||||
serde_json::to_writer_pretty(writer, &map)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_trace(normalizer: &TitleNormalizer, data: &Data, forward: &PageMap, start: &str) {
|
||||
let start_idx = util::resolve_title(normalizer, data, start);
|
||||
|
||||
let mut current = start_idx;
|
||||
let mut visited = HashSet::new();
|
||||
loop {
|
||||
let page = &data.pages[current.usize()];
|
||||
let title = &page.title;
|
||||
if page.redirect {
|
||||
println!(" v {title}");
|
||||
} else {
|
||||
println!(" - {title}");
|
||||
}
|
||||
|
||||
visited.insert(current);
|
||||
|
||||
let next = forward.get(current);
|
||||
|
||||
if next == NodeIdx::NONE {
|
||||
println!("> dead-end reached");
|
||||
return;
|
||||
}
|
||||
|
||||
if visited.contains(&next) {
|
||||
let page = &data.pages[next.usize()];
|
||||
let title = &page.title;
|
||||
println!("> loop detected ({title})");
|
||||
return;
|
||||
}
|
||||
|
||||
current = next;
|
||||
}
|
||||
}
|
||||
|
||||
fn print_canonical_pages_as_json(data: &Data, cluster: &PageMap) -> io::Result<()> {
|
||||
let map = cluster
|
||||
.0
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(page, canonical)| {
|
||||
(
|
||||
&data.pages[page].title,
|
||||
&data.pages[canonical.usize()].title,
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let writer = BufWriter::new(io::stdout());
|
||||
serde_json::to_writer_pretty(writer, &map)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, clap::Parser)]
|
||||
enum Command {
|
||||
First,
|
||||
Trace { start: String },
|
||||
Canonical,
|
||||
Cluster,
|
||||
}
|
||||
|
||||
/// Show interesting stats.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
}
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(self, data: Data) -> io::Result<()> {
|
||||
let normalizer = TitleNormalizer::new();
|
||||
|
||||
eprintln!(">> Forward");
|
||||
let forward = find_forward_edges(&data);
|
||||
|
||||
match self.command {
|
||||
Command::First => {
|
||||
eprintln!(">> First links");
|
||||
print_forward_edges_as_json(&data, &forward)?;
|
||||
return Ok(());
|
||||
}
|
||||
Command::Trace { start } => {
|
||||
eprintln!(">> Tracing");
|
||||
print_trace(&normalizer, &data, &forward, &start);
|
||||
return Ok(());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Determine cluster for each page, represented via canonical page. The
|
||||
// canonical page of a cluster is either a dead-end or the loop member with
|
||||
// the smallest index.
|
||||
eprintln!(">> Find clusters");
|
||||
let cluster = find_clusters(&data, &forward);
|
||||
|
||||
if self.command == Command::Canonical {
|
||||
print_canonical_pages_as_json(&data, &cluster)?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Measure cluster size
|
||||
eprintln!(">> Measure clusters");
|
||||
let mut cluster_size = HashMap::<NodeIdx, u32>::new();
|
||||
for (i, canonical) in cluster.0.iter().enumerate() {
|
||||
assert!(*canonical != NodeIdx::NONE, "{}", data.pages[i].title);
|
||||
*cluster_size.entry(*canonical).or_default() += 1;
|
||||
}
|
||||
let mut cluster_by_size = cluster_size.into_iter().collect::<Vec<_>>();
|
||||
cluster_by_size.sort_by_key(|(c, s)| (*s, *c));
|
||||
cluster_by_size.reverse();
|
||||
|
||||
// Print clusters
|
||||
assert!(self.command == Command::Cluster);
|
||||
let resolved = resolve_clusters(&forward, &cluster);
|
||||
for (canonical, size) in cluster_by_size {
|
||||
match resolved.get(&canonical).unwrap() {
|
||||
Cluster::DeadEnd(page) => {
|
||||
let title = &data.pages[page.usize()].title;
|
||||
println!("Cluster (dead-end, {size}): {title}");
|
||||
}
|
||||
Cluster::Loop(pages) => {
|
||||
println!("Cluster ({}-loop, {size}):", pages.len());
|
||||
for page in pages {
|
||||
let page = &data.pages[page.usize()];
|
||||
let title = &page.title;
|
||||
if page.redirect {
|
||||
println!(" v {title}");
|
||||
} else {
|
||||
println!(" - {title}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
267
brood/src/commands/philosophy_game.rs
Normal file
267
brood/src/commands/philosophy_game.rs
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
use std::{
|
||||
collections::{BTreeSet, HashMap, HashSet},
|
||||
fs::File,
|
||||
io::{self, BufReader, BufWriter},
|
||||
path::Path,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
data::{
|
||||
adjacency_list::AdjacencyList,
|
||||
info::{LinkInfo, PageInfo},
|
||||
store,
|
||||
},
|
||||
util, PhilosophyGameCmd,
|
||||
};
|
||||
|
||||
struct PageMap(Vec<u32>);
|
||||
|
||||
impl PageMap {
|
||||
fn new(len: usize) -> Self {
|
||||
Self(vec![u32::MAX; len])
|
||||
}
|
||||
|
||||
fn get(&self, page_idx: u32) -> u32 {
|
||||
self.0[page_idx as usize]
|
||||
}
|
||||
|
||||
fn set(&mut self, page_idx: u32, to: u32) {
|
||||
self.0[page_idx as usize] = to;
|
||||
}
|
||||
}
|
||||
|
||||
fn first_viable_link(data: &AdjacencyList<PageInfo, LinkInfo>, page_idx: u32) -> Option<u32> {
|
||||
for link_idx in data.link_range(page_idx) {
|
||||
let link = data.link(link_idx);
|
||||
if !link.data.in_parens() && !link.data.in_structure() {
|
||||
return Some(link.to);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn find_forward_edges(data: &AdjacencyList<PageInfo, LinkInfo>) -> PageMap {
|
||||
let mut result = PageMap::new(data.pages.len());
|
||||
for (page_idx, _) in data.pages() {
|
||||
if let Some(first_link) = first_viable_link(data, page_idx) {
|
||||
result.set(page_idx, first_link);
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn find_clusters(data: &AdjacencyList<PageInfo, LinkInfo>, forward: &PageMap) -> PageMap {
|
||||
let mut cluster = PageMap::new(data.pages.len());
|
||||
for (page_idx, _) in data.pages() {
|
||||
let mut current = page_idx;
|
||||
let mut visited = HashSet::new();
|
||||
let canonical = loop {
|
||||
// We've already determined the canonical element for this page.
|
||||
if cluster.get(current) != u32::MAX {
|
||||
break cluster.get(current);
|
||||
}
|
||||
|
||||
// We've hit a loop
|
||||
if visited.contains(¤t) {
|
||||
let mut loop_members = BTreeSet::new();
|
||||
while !loop_members.contains(¤t) {
|
||||
loop_members.insert(current);
|
||||
current = forward.get(current);
|
||||
}
|
||||
break loop_members.pop_first().unwrap();
|
||||
}
|
||||
|
||||
visited.insert(current);
|
||||
|
||||
let next = forward.get(current);
|
||||
if next == u32::MAX {
|
||||
// We've hit a dead-end
|
||||
break current;
|
||||
}
|
||||
|
||||
current = next;
|
||||
};
|
||||
|
||||
for i in visited {
|
||||
cluster.set(i, canonical);
|
||||
}
|
||||
}
|
||||
|
||||
cluster
|
||||
}
|
||||
|
||||
enum Cluster {
|
||||
DeadEnd(u32),
|
||||
Loop(Vec<u32>),
|
||||
}
|
||||
|
||||
fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap<u32, Cluster> {
|
||||
let mut result = HashMap::new();
|
||||
for canonical in cluster.0.iter().copied().collect::<HashSet<_>>() {
|
||||
if forward.get(canonical) == u32::MAX {
|
||||
result.insert(canonical, Cluster::DeadEnd(canonical));
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut members = vec![];
|
||||
let mut current = canonical;
|
||||
loop {
|
||||
members.push(current);
|
||||
current = forward.get(current);
|
||||
if current == canonical {
|
||||
break;
|
||||
}
|
||||
}
|
||||
result.insert(canonical, Cluster::Loop(members));
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn print_forward_edges_as_json(
|
||||
data: &AdjacencyList<PageInfo, LinkInfo>,
|
||||
forward: &PageMap,
|
||||
) -> io::Result<()> {
|
||||
let map = forward
|
||||
.0
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(page, first_link)| {
|
||||
let page_title = &data.page(page as u32).data.title;
|
||||
let first_link_title = if *first_link == u32::MAX {
|
||||
None
|
||||
} else {
|
||||
Some(&data.page(*first_link).data.title)
|
||||
};
|
||||
(page_title, first_link_title)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let writer = BufWriter::new(io::stdout());
|
||||
serde_json::to_writer_pretty(writer, &map)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_trace(data: &AdjacencyList<PageInfo, LinkInfo>, forward: &PageMap, start: &str) {
|
||||
let start_idx = util::resolve_redirects(data, util::find_index_of_title(&data.pages, start));
|
||||
|
||||
let mut current = start_idx;
|
||||
let mut visited = HashSet::new();
|
||||
loop {
|
||||
let page = data.page(current);
|
||||
let title = &page.data.title;
|
||||
if page.data.redirect {
|
||||
println!(" v {title}");
|
||||
} else {
|
||||
println!(" - {title}");
|
||||
}
|
||||
|
||||
visited.insert(current);
|
||||
|
||||
let next = forward.get(current);
|
||||
|
||||
if next == u32::MAX {
|
||||
println!("dead-end reached");
|
||||
return;
|
||||
}
|
||||
|
||||
if visited.contains(&next) {
|
||||
println!("loop detected");
|
||||
return;
|
||||
}
|
||||
|
||||
current = next;
|
||||
}
|
||||
}
|
||||
|
||||
fn print_canonical_pages_as_json(
|
||||
data: &AdjacencyList<PageInfo, LinkInfo>,
|
||||
cluster: &PageMap,
|
||||
) -> io::Result<()> {
|
||||
let map = cluster
|
||||
.0
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(page, canonical)| {
|
||||
(
|
||||
&data.page(page as u32).data.title,
|
||||
&data.page(*canonical).data.title,
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let writer = BufWriter::new(io::stdout());
|
||||
serde_json::to_writer_pretty(writer, &map)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Entry point for the philosophy-game analyses: loads the data file,
/// computes each page's first ("forward") link, then dispatches on the
/// subcommand. Progress markers are written to stderr; results to stdout.
pub fn run(datafile: &Path, subcmd: PhilosophyGameCmd) -> io::Result<()> {
    eprintln!(">> Import");
    let mut databuf = BufReader::new(File::open(datafile)?);
    let data = store::read_adjacency_list(&mut databuf)?;

    eprintln!(">> Forward");
    let forward = find_forward_edges(&data);

    // Subcommands that only need the forward edges are handled (and return)
    // here; the remaining subcommands also need clusters.
    match subcmd {
        PhilosophyGameCmd::First => {
            eprintln!(">> First links");
            print_forward_edges_as_json(&data, &forward)?;
            return Ok(());
        }
        PhilosophyGameCmd::Trace { start } => {
            eprintln!(">> Tracing");
            print_trace(&data, &forward, &start);
            return Ok(());
        }
        _ => {}
    }

    // Determine cluster for each page, represented via canonical page. The
    // canonical page of a cluster is either a dead-end or the loop member with
    // the smallest index.
    eprintln!(">> Find clusters");
    let cluster = find_clusters(&data, &forward);

    if subcmd == PhilosophyGameCmd::Canonical {
        print_canonical_pages_as_json(&data, &cluster)?;
        return Ok(());
    }

    // Measure cluster size
    eprintln!(">> Measure clusters");
    let mut cluster_size = HashMap::<u32, u32>::new();
    for (i, canonical) in cluster.0.iter().enumerate() {
        // Every page must belong to some cluster; the failing page's title
        // is included in the panic message for diagnosis.
        assert!(*canonical != u32::MAX, "{}", data.page(i as u32).data.title);
        *cluster_size.entry(*canonical).or_default() += 1;
    }
    // Sort by (size, canonical index) and reverse, so the largest clusters
    // come first with a deterministic tie-break.
    let mut cluster_by_size = cluster_size.into_iter().collect::<Vec<_>>();
    cluster_by_size.sort_by_key(|(c, s)| (*s, *c));
    cluster_by_size.reverse();

    // Print clusters
    // At this point `Cluster` is the only subcommand left.
    assert!(subcmd == PhilosophyGameCmd::Cluster);
    let resolved = resolve_clusters(&forward, &cluster);
    for (canonical, size) in cluster_by_size {
        match resolved.get(&canonical).unwrap() {
            Cluster::DeadEnd(page) => {
                let title = &data.page(*page).data.title;
                println!("Cluster (dead-end, {size}): {title}");
            }
            Cluster::Loop(pages) => {
                println!("Cluster ({}-loop, {size}):", pages.len());
                for page in pages {
                    let page = data.page(*page);
                    let title = &page.data.title;
                    // Same markers as print_trace: "v" redirect, "-" page.
                    if page.data.redirect {
                        println!(" v {title}");
                    } else {
                        println!(" - {title}");
                    }
                }
            }
        }
    }

    Ok(())
}
|
||||
104
brood/src/commands/reexport.rs
Normal file
104
brood/src/commands/reexport.rs
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::data::adjacency_list::AdjacencyList;
|
||||
use crate::data::info::{LinkInfo, PageInfo};
|
||||
use crate::data::store;
|
||||
use crate::util;
|
||||
|
||||
/// One entry of the JSON filter file accepted by `reexport`.
#[derive(Deserialize)]
struct FilterFile {
    // Page title; compared against normalized page titles.
    title: String,
    // Language code; only entries with language "en" are used.
    language: String,
}
|
||||
|
||||
fn filter_pages(
|
||||
data: &AdjacencyList<PageInfo, LinkInfo>,
|
||||
keep: HashSet<String>,
|
||||
) -> AdjacencyList<PageInfo, LinkInfo> {
|
||||
// Map from old to new indices. Only contains entries for pages to keep.
|
||||
let mut index_map = HashMap::new();
|
||||
for (page_idx, page) in data.pages() {
|
||||
if keep.contains(&util::normalize_link(&page.data.title)) {
|
||||
index_map.insert(page_idx, index_map.len() as u32);
|
||||
}
|
||||
}
|
||||
|
||||
// Create new adjacency list in a single pass
|
||||
let mut result = AdjacencyList::default();
|
||||
for (page_idx, page) in data.pages() {
|
||||
let Some(new_idx) = index_map.get(&page_idx) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let actual_new_idx = result.push_page(page.data.clone());
|
||||
assert!(*new_idx == actual_new_idx);
|
||||
|
||||
for (_, link) in data.links(page_idx) {
|
||||
if let Some(to) = index_map.get(&link.to) {
|
||||
result.push_link(*to, link.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub fn reexport(
|
||||
from: &Path,
|
||||
to: &Path,
|
||||
in_parens: Option<bool>,
|
||||
in_structure: Option<bool>,
|
||||
filter: Option<PathBuf>,
|
||||
) -> io::Result<()> {
|
||||
eprintln!(">> Import");
|
||||
let mut from = BufReader::new(File::open(from)?);
|
||||
let mut data = store::read_adjacency_list(&mut from)?;
|
||||
|
||||
eprintln!(">> Consistency check");
|
||||
data.check_consistency();
|
||||
|
||||
if in_parens.is_some() || in_structure.is_some() || filter.is_some() {
|
||||
eprintln!(">> Filtering");
|
||||
|
||||
let mut data2 = AdjacencyList::default();
|
||||
for (page_idx, page) in data.pages() {
|
||||
data2.push_page(page.data.clone());
|
||||
for (_, link) in data.links(page_idx) {
|
||||
if in_parens.is_some_and(|v| v != link.data.in_parens()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if in_structure.is_some_and(|v| v != link.data.in_structure()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
data2.push_link(link.to, link.data);
|
||||
}
|
||||
}
|
||||
|
||||
data = data2;
|
||||
|
||||
if let Some(filter) = filter {
|
||||
let filter = fs::read_to_string(filter)?;
|
||||
let filter = serde_json::from_str::<Vec<FilterFile>>(&filter).unwrap();
|
||||
let keep = filter
|
||||
.into_iter()
|
||||
.filter(|f| f.language == "en")
|
||||
.map(|f| f.title)
|
||||
.map(|t| util::normalize_link(&t))
|
||||
.collect::<HashSet<_>>();
|
||||
data = filter_pages(&data, keep);
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!(">> Export");
|
||||
let mut to = BufWriter::new(File::create(to)?);
|
||||
store::write_adjacency_list(&data, &mut to)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -1,151 +0,0 @@
|
|||
use std::{collections::HashSet, io};
|
||||
|
||||
use thousands::Separable;
|
||||
|
||||
use crate::{
|
||||
data::Data,
|
||||
util::{self, TitleNormalizer},
|
||||
};
|
||||
|
||||
/// Show info about a specific article.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
title: String,
|
||||
|
||||
/// Print links in more detail.
|
||||
#[arg(long, short)]
|
||||
links: bool,
|
||||
}
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(self, data: Data) -> io::Result<()> {
|
||||
let normalizer = TitleNormalizer::new();
|
||||
|
||||
println!(">> Locate article");
|
||||
let mut node = util::locate_title(&normalizer, &data, &self.title);
|
||||
|
||||
loop {
|
||||
let page = &data.pages[node.usize()];
|
||||
|
||||
const W_LABEL: usize = 12;
|
||||
const W_NUM: usize = 11;
|
||||
|
||||
println!();
|
||||
|
||||
println!("{:>W_LABEL$}: {}", "Title", page.title);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {}",
|
||||
"Title (norm)",
|
||||
normalizer.normalize(&page.title)
|
||||
);
|
||||
|
||||
println!("{:>W_LABEL$}: {}", "Redirect", page.redirect);
|
||||
|
||||
println!("{:>W_LABEL$}: {:>W_NUM$}", "ID", page.id);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"Length",
|
||||
page.length.separate_with_underscores()
|
||||
);
|
||||
|
||||
let outlinks = data.graph.edge_slice(node).to_vec();
|
||||
let inlinks = data
|
||||
.graph
|
||||
.edges()
|
||||
.filter(|(_, target)| *target == node)
|
||||
.map(|(source, _)| source)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let outlinks_set = outlinks.iter().copied().collect::<HashSet<_>>();
|
||||
let inlinks_set = inlinks.iter().copied().collect::<HashSet<_>>();
|
||||
let twins_set = outlinks_set
|
||||
.intersection(&inlinks_set)
|
||||
.copied()
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"Links (out)",
|
||||
outlinks.len().separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"unique",
|
||||
outlinks_set.len().separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"Links (in)",
|
||||
inlinks.len().separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"unique",
|
||||
inlinks_set.len().separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"Twins",
|
||||
twins_set.len().separate_with_underscores()
|
||||
);
|
||||
|
||||
if self.links {
|
||||
let mut twin_pages = twins_set
|
||||
.iter()
|
||||
.map(|n| &data.pages[n.usize()])
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut outlink_only_pages = outlinks_set
|
||||
.difference(&twins_set)
|
||||
.map(|n| &data.pages[n.usize()])
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut inlink_only_pages = inlinks_set
|
||||
.difference(&twins_set)
|
||||
.map(|n| &data.pages[n.usize()])
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
twin_pages.sort_by_key(|p| &p.title);
|
||||
outlink_only_pages.sort_by_key(|p| &p.title);
|
||||
inlink_only_pages.sort_by_key(|p| &p.title);
|
||||
|
||||
println!();
|
||||
println!("Twins ({}):", twin_pages.len().separate_with_underscores());
|
||||
for page in twin_pages {
|
||||
println!("{}", util::fmt_page(page));
|
||||
}
|
||||
|
||||
println!();
|
||||
println!(
|
||||
"Only outlinks ({}):",
|
||||
outlink_only_pages.len().separate_with_underscores()
|
||||
);
|
||||
for page in outlink_only_pages {
|
||||
println!("{}", util::fmt_page(page));
|
||||
}
|
||||
|
||||
println!();
|
||||
println!(
|
||||
"Only inlinks ({}):",
|
||||
inlink_only_pages.len().separate_with_underscores()
|
||||
);
|
||||
for page in inlink_only_pages {
|
||||
println!("{}", util::fmt_page(page));
|
||||
}
|
||||
}
|
||||
|
||||
node = match data.redirect_target(node) {
|
||||
Some(target) => target,
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -1,98 +0,0 @@
|
|||
mod degrees;
|
||||
mod redirects;
|
||||
|
||||
use std::io;
|
||||
|
||||
use thousands::Separable;
|
||||
|
||||
use crate::data::Data;
|
||||
|
||||
#[derive(Debug, clap::Parser)]
|
||||
enum Command {
|
||||
Degrees(degrees::Cmd),
|
||||
Redirects(redirects::Cmd),
|
||||
}
|
||||
|
||||
/// Show interesting stats.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
#[command(subcommand)]
|
||||
command: Option<Command>,
|
||||
}
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(self, data: Data) -> io::Result<()> {
|
||||
if let Some(cmd) = self.command {
|
||||
return match cmd {
|
||||
Command::Degrees(cmd) => cmd.run(data),
|
||||
Command::Redirects(cmd) => cmd.run(data),
|
||||
};
|
||||
}
|
||||
|
||||
println!();
|
||||
|
||||
const W_LABEL: usize = 14;
|
||||
const W_NUM: usize = 11;
|
||||
|
||||
let n_pages = data.pages.len();
|
||||
let n_redirects = data.pages.iter().filter(|p| p.redirect).count();
|
||||
let n_articles = n_pages - n_redirects;
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"Pages",
|
||||
n_pages.separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"Articles",
|
||||
n_articles.separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"Redirects",
|
||||
n_redirects.separate_with_underscores()
|
||||
);
|
||||
|
||||
println!();
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"Links",
|
||||
data.links.len().separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"in parens",
|
||||
data.links
|
||||
.iter()
|
||||
.filter(|l| l.in_parens())
|
||||
.count()
|
||||
.separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"in structures",
|
||||
data.links
|
||||
.iter()
|
||||
.filter(|l| l.in_structure())
|
||||
.count()
|
||||
.separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"{:>W_LABEL$}: {:>W_NUM$}",
|
||||
"pg eligible",
|
||||
data.links
|
||||
.iter()
|
||||
.filter(|l| !l.in_parens() && !l.in_structure())
|
||||
.count()
|
||||
.separate_with_underscores()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -1,92 +0,0 @@
|
|||
use std::{cmp::Reverse, io};
|
||||
|
||||
use thousands::Separable;
|
||||
|
||||
use crate::{
|
||||
algo,
|
||||
data::{Data, Page},
|
||||
util,
|
||||
};
|
||||
|
||||
/// Show stats on article in- and out-degrees.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
#[arg(long, short, default_value_t = 5)]
|
||||
top: usize,
|
||||
}
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(self, mut data: Data) -> io::Result<()> {
|
||||
println!(">> Outdegree");
|
||||
println!("> Counting links");
|
||||
let mut outdegree = vec![usize::MAX; data.pages.len()];
|
||||
for node in data.graph.nodes() {
|
||||
outdegree[node.usize()] = data.graph.edge_range(node).len();
|
||||
}
|
||||
|
||||
println!(">> Indegree");
|
||||
println!("> Inverting edges");
|
||||
algo::invert(&mut data);
|
||||
let mut indegree = vec![usize::MAX; data.pages.len()];
|
||||
println!("> Counting links");
|
||||
for node in data.graph.nodes() {
|
||||
indegree[node.usize()] = data.graph.edge_range(node).len();
|
||||
}
|
||||
|
||||
let mut by_degrees = data
|
||||
.pages
|
||||
.iter()
|
||||
.zip(outdegree)
|
||||
.zip(indegree)
|
||||
.map(|((p, od), id)| (p, od, id))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
println!();
|
||||
println!("Most outlinks");
|
||||
println!("¯¯¯¯¯¯¯¯¯¯¯¯¯");
|
||||
|
||||
by_degrees.sort_by_key(|(_, od, _)| Reverse(*od));
|
||||
self.print_links(&by_degrees);
|
||||
|
||||
println!();
|
||||
println!("Most inlinks");
|
||||
println!("¯¯¯¯¯¯¯¯¯¯¯¯");
|
||||
|
||||
by_degrees.sort_by_key(|(_, _, id)| Reverse(*id));
|
||||
self.print_links(&by_degrees);
|
||||
|
||||
by_degrees.retain(|(_, od, id)| *od > 0 && *id > 0);
|
||||
|
||||
println!();
|
||||
println!("Most outlinks per non-zero inlink");
|
||||
println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯");
|
||||
|
||||
by_degrees.sort_by(|(_, od1, id1), (_, od2, id2)| {
|
||||
let r1 = *od1 as f32 / *id1 as f32;
|
||||
let r2 = *od2 as f32 / *id2 as f32;
|
||||
r2.total_cmp(&r1) // Reverse order so max values are at beginnibg
|
||||
});
|
||||
self.print_links(&by_degrees);
|
||||
|
||||
println!();
|
||||
println!("Most inlinks per non-zero outlink");
|
||||
println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯");
|
||||
|
||||
by_degrees.reverse();
|
||||
self.print_links(&by_degrees);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_links(&self, by_degrees: &Vec<(&Page, usize, usize)>) {
|
||||
for (i, (page, od, id)) in by_degrees.iter().take(self.top).enumerate() {
|
||||
println!(
|
||||
"{:3}. {} ({} out, {} in)",
|
||||
i + 1,
|
||||
util::fmt_page(page),
|
||||
od.separate_with_underscores(),
|
||||
id.separate_with_underscores()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,107 +0,0 @@
|
|||
use std::{cmp::Reverse, collections::HashSet, io};
|
||||
|
||||
use thousands::Separable;
|
||||
|
||||
use crate::{data::Data, graph::NodeIdx, util};
|
||||
|
||||
fn find_redirects(data: &Data) -> Vec<(NodeIdx, NodeIdx, usize)> {
|
||||
let mut redirects = Vec::<(NodeIdx, NodeIdx, usize)>::new();
|
||||
|
||||
for node in data.graph.nodes() {
|
||||
if !data.pages[node.usize()].redirect {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut seen = HashSet::new();
|
||||
|
||||
let mut curr = node;
|
||||
seen.insert(node);
|
||||
|
||||
while let Some(next) = data.redirect_target(curr) {
|
||||
if seen.contains(&next) {
|
||||
println!(" Redirect loop: {}", data.pages[node.usize()].title);
|
||||
break;
|
||||
}
|
||||
|
||||
curr = next;
|
||||
seen.insert(next);
|
||||
}
|
||||
|
||||
redirects.push((node, curr, seen.len() - 1));
|
||||
}
|
||||
|
||||
redirects
|
||||
}
|
||||
|
||||
fn follow_redirect(data: &Data, start: NodeIdx) -> Vec<NodeIdx> {
|
||||
let mut seen = HashSet::new();
|
||||
let mut nodes = Vec::new();
|
||||
|
||||
let mut curr = start;
|
||||
seen.insert(curr);
|
||||
nodes.push(curr);
|
||||
|
||||
while let Some(next) = data.redirect_target(curr) {
|
||||
if seen.contains(&next) {
|
||||
break;
|
||||
}
|
||||
|
||||
curr = next;
|
||||
seen.insert(curr);
|
||||
nodes.push(curr);
|
||||
}
|
||||
|
||||
nodes
|
||||
}
|
||||
|
||||
/// Show redirect stats.
|
||||
#[derive(Debug, clap::Parser)]
|
||||
pub struct Cmd {
|
||||
/// Show more detailed info.
|
||||
#[arg(long, short)]
|
||||
long: bool,
|
||||
}
|
||||
|
||||
impl Cmd {
|
||||
pub fn run(self, data: Data) -> io::Result<()> {
|
||||
println!(">> Resolve redirects");
|
||||
let redirects = find_redirects(&data);
|
||||
|
||||
println!(
|
||||
"There is a total of {} redirects.",
|
||||
redirects.len().separate_with_underscores()
|
||||
);
|
||||
|
||||
let mut long = redirects
|
||||
.iter()
|
||||
.filter(|(_, _, l)| *l > 1)
|
||||
.collect::<Vec<_>>();
|
||||
long.sort_by_key(|(_, _, l)| Reverse(l));
|
||||
|
||||
println!(
|
||||
"{} redirects take more than one step to reach an article.",
|
||||
long.len().separate_with_underscores()
|
||||
);
|
||||
|
||||
println!(
|
||||
"The longest redirect chain takes {} steps.",
|
||||
long.iter().map(|(_, _, l)| l).max().copied().unwrap_or(0),
|
||||
);
|
||||
|
||||
println!("Though these redirect chains are usually swiftly fixed by bots.");
|
||||
|
||||
if self.long {
|
||||
println!();
|
||||
println!("Redirect chains with length > 1:");
|
||||
|
||||
for (start, _, _) in long {
|
||||
println!();
|
||||
for step in follow_redirect(&data, *start) {
|
||||
println!("{}", util::fmt_page(&data.pages[step.usize()]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -1,218 +1,3 @@
|
|||
use std::{
|
||||
fs::File,
|
||||
io::{self, BufReader, BufWriter, Read, Write},
|
||||
path::Path,
|
||||
};
|
||||
|
||||
use crate::graph::{EdgeIdx, Graph, NodeIdx};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Page {
|
||||
pub id: u32,
|
||||
pub title: String,
|
||||
pub length: u32,
|
||||
pub redirect: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy)]
|
||||
pub struct Link {
|
||||
pub start: u32,
|
||||
pub len: u32,
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
impl Link {
|
||||
pub fn in_parens(self) -> bool {
|
||||
self.flags & 0b1 != 0
|
||||
}
|
||||
|
||||
pub fn in_structure(self) -> bool {
|
||||
self.flags & 0b10 != 0
|
||||
}
|
||||
}
|
||||
|
||||
fn write_u8(w: &mut impl Write, n: u8) -> io::Result<()> {
|
||||
w.write_all(&n.to_le_bytes())
|
||||
}
|
||||
|
||||
fn read_u8(r: &mut impl Read) -> io::Result<u8> {
|
||||
let mut buf = [0_u8; 1];
|
||||
r.read_exact(&mut buf)?;
|
||||
Ok(u8::from_le_bytes(buf))
|
||||
}
|
||||
|
||||
fn write_u16(w: &mut impl Write, n: u16) -> io::Result<()> {
|
||||
w.write_all(&n.to_le_bytes())
|
||||
}
|
||||
|
||||
fn read_u16(r: &mut impl Read) -> io::Result<u16> {
|
||||
let mut buf = [0_u8; 2];
|
||||
r.read_exact(&mut buf)?;
|
||||
Ok(u16::from_le_bytes(buf))
|
||||
}
|
||||
|
||||
fn write_u32(w: &mut impl Write, n: u32) -> io::Result<()> {
|
||||
w.write_all(&n.to_le_bytes())
|
||||
}
|
||||
|
||||
fn read_u32(r: &mut impl Read) -> io::Result<u32> {
|
||||
let mut buf = [0_u8; 4];
|
||||
r.read_exact(&mut buf)?;
|
||||
Ok(u32::from_le_bytes(buf))
|
||||
}
|
||||
|
||||
fn write_str(w: &mut impl Write, s: &str) -> io::Result<()> {
|
||||
assert!(s.len() <= u16::MAX as usize);
|
||||
write_u16(w, s.len() as u16)?;
|
||||
w.write_all(s.as_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_str(r: &mut impl Read) -> io::Result<String> {
|
||||
let len = read_u16(r)? as usize;
|
||||
let mut buf = vec![0_u8; len];
|
||||
r.read_exact(&mut buf)?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
|
||||
fn write_page(w: &mut impl Write, page: &Page) -> io::Result<()> {
|
||||
write_u32(w, page.id)?;
|
||||
write_u32(w, page.length)?;
|
||||
write_u8(w, if page.redirect { 1 } else { 0 })?;
|
||||
write_str(w, &page.title)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read_page(r: &mut impl Read) -> io::Result<Page> {
|
||||
Ok(Page {
|
||||
id: read_u32(r)?,
|
||||
length: read_u32(r)?,
|
||||
redirect: read_u8(r)? != 0,
|
||||
title: read_str(r)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn write_link(w: &mut impl Write, link: &Link) -> io::Result<()> {
|
||||
write_u32(w, link.start)?;
|
||||
write_u32(w, link.len)?;
|
||||
write_u8(w, link.flags)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_link(r: &mut impl Read) -> io::Result<Link> {
|
||||
Ok(Link {
|
||||
start: read_u32(r)?,
|
||||
len: read_u32(r)?,
|
||||
flags: read_u8(r)?,
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Data {
|
||||
pub pages: Vec<Page>,
|
||||
pub links: Vec<Link>,
|
||||
pub graph: Graph,
|
||||
}
|
||||
|
||||
impl Data {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
pub fn with_capacity(pages: usize, links: usize) -> Self {
|
||||
Self {
|
||||
pages: Vec::with_capacity(pages),
|
||||
links: Vec::with_capacity(links),
|
||||
graph: Graph::with_capacity(pages, links),
|
||||
}
|
||||
}
|
||||
|
||||
fn write(&self, w: &mut impl Write) -> io::Result<()> {
|
||||
assert!(self.pages.len() < u32::MAX as usize);
|
||||
assert!(self.links.len() < u32::MAX as usize);
|
||||
assert_eq!(self.pages.len(), self.graph.nodes.len());
|
||||
assert_eq!(self.links.len(), self.graph.edges.len());
|
||||
write_u32(w, self.pages.len() as u32)?;
|
||||
write_u32(w, self.links.len() as u32)?;
|
||||
|
||||
for page in &self.pages {
|
||||
write_page(w, page)?;
|
||||
}
|
||||
|
||||
for link in &self.links {
|
||||
write_link(w, link)?;
|
||||
}
|
||||
|
||||
for node in &self.graph.nodes {
|
||||
write_u32(w, node.0)?;
|
||||
}
|
||||
|
||||
for edge in &self.graph.edges {
|
||||
write_u32(w, edge.0)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read(r: &mut impl Read) -> io::Result<Self> {
|
||||
let n_pages = read_u32(r)?;
|
||||
let n_links = read_u32(r)?;
|
||||
|
||||
let mut result = Self::with_capacity(n_pages as usize, n_links as usize);
|
||||
|
||||
for _ in 0..n_pages {
|
||||
result.pages.push(read_page(r)?);
|
||||
}
|
||||
|
||||
for _ in 0..n_links {
|
||||
result.links.push(read_link(r)?);
|
||||
}
|
||||
|
||||
for _ in 0..n_pages {
|
||||
result.graph.nodes.push(EdgeIdx(read_u32(r)?));
|
||||
}
|
||||
|
||||
for _ in 0..n_links {
|
||||
result.graph.edges.push(NodeIdx(read_u32(r)?));
|
||||
}
|
||||
|
||||
assert_eq!(result.pages.len(), result.graph.nodes.len());
|
||||
assert_eq!(result.links.len(), result.graph.edges.len());
|
||||
result.graph.check_consistency();
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn write_to_file(&self, path: &Path) -> io::Result<()> {
|
||||
let mut file = BufWriter::new(File::create(path)?);
|
||||
self.write(&mut file)
|
||||
}
|
||||
|
||||
pub fn read_from_file(path: &Path) -> io::Result<Self> {
|
||||
let mut file = BufReader::new(File::open(path)?);
|
||||
Self::read(&mut file)
|
||||
}
|
||||
|
||||
pub fn check_consistency(&self) {
|
||||
assert_eq!(
|
||||
self.pages.len(),
|
||||
self.graph.nodes.len(),
|
||||
"inconsistent number of pages"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
self.links.len(),
|
||||
self.graph.edges.len(),
|
||||
"inconsistent number of links"
|
||||
);
|
||||
|
||||
self.graph.check_consistency();
|
||||
}
|
||||
|
||||
pub fn redirect_target(&self, node: NodeIdx) -> Option<NodeIdx> {
|
||||
if !self.pages[node.usize()].redirect {
|
||||
return None;
|
||||
}
|
||||
|
||||
self.graph.edge_slice(node).first().copied()
|
||||
}
|
||||
}
|
||||
pub mod adjacency_list;
|
||||
pub mod info;
|
||||
pub mod store;
|
||||
|
|
|
|||
162
brood/src/data/adjacency_list.rs
Normal file
162
brood/src/data/adjacency_list.rs
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
use std::ops::Range;
|
||||
|
||||
use super::info::{LinkInfo, PageInfo};
|
||||
|
||||
/// A page node of the adjacency list.
#[derive(Debug, Clone, Copy)]
pub struct Page<P> {
    /// Index of the first link belonging to this page.
    pub start: u32,
    pub data: P,
}

impl<P> Page<P> {
    /// Map the payload to a new type, keeping the link start index.
    pub fn change_data<P2>(self, f: impl Fn(P) -> P2) -> Page<P2> {
        let Page { start, data } = self;
        Page {
            start,
            data: f(data),
        }
    }
}
|
||||
|
||||
/// A directed link of the adjacency list.
#[derive(Debug, Clone, Copy)]
pub struct Link<L> {
    /// Index of the page this link points to.
    pub to: u32,
    pub data: L,
}

impl<L> Link<L> {
    /// Map the payload to a new type, keeping the target page index.
    pub fn change_data<L2>(self, f: impl Fn(L) -> L2) -> Link<L2> {
        let Link { to, data } = self;
        Link {
            to,
            data: f(data),
        }
    }
}
|
||||
|
||||
/// Compact adjacency-list graph: `pages[i].start` indexes into `links` at
/// page `i`'s first link; a page's links run until the next page's `start`
/// (or the end of `links` for the last page). See `link_range`.
pub struct AdjacencyList<P, L> {
    pub pages: Vec<Page<P>>,
    pub links: Vec<Link<L>>,
}

// Implemented manually rather than derived so `Default` is available for
// any `P`/`L`: the derive would add unnecessary `P: Default, L: Default`
// bounds even though only the (always-defaultable) Vecs need defaults.
impl<P, L> Default for AdjacencyList<P, L> {
    fn default() -> Self {
        Self {
            pages: Default::default(),
            links: Default::default(),
        }
    }
}
|
||||
|
||||
impl<P, L> AdjacencyList<P, L> {
    /// Append a page; its links are whatever is pushed via `push_link`
    /// between this call and the next `push_page`. Returns the new page's
    /// index.
    pub fn push_page(&mut self, data: P) -> u32 {
        self.pages.push(Page {
            // The page's links start where the link list currently ends.
            start: self.links.len() as u32,
            data,
        });
        self.pages.len() as u32 - 1
    }

    /// Append a link belonging to the most recently pushed page. Returns
    /// the new link's index.
    pub fn push_link(&mut self, to: u32, data: L) -> u32 {
        self.links.push(Link { to, data });
        self.links.len() as u32 - 1
    }

    /// Page by index. Panics if out of bounds.
    pub fn page(&self, page_idx: u32) -> &Page<P> {
        &self.pages[page_idx as usize]
    }

    /// Mutable page by index. Panics if out of bounds.
    pub fn page_mut(&mut self, page_idx: u32) -> &mut Page<P> {
        &mut self.pages[page_idx as usize]
    }

    /// Iterate over `(page index, page)` pairs.
    pub fn pages(&self) -> impl Iterator<Item = (u32, &Page<P>)> {
        self.pages.iter().enumerate().map(|(i, p)| (i as u32, p))
    }

    /// Link by index. Panics if out of bounds.
    pub fn link(&self, link_idx: u32) -> &Link<L> {
        &self.links[link_idx as usize]
    }

    /// Mutable link by index. Panics if out of bounds.
    pub fn link_mut(&mut self, link_idx: u32) -> &mut Link<L> {
        &mut self.links[link_idx as usize]
    }

    /// Range of link indices belonging to `page_idx`: from this page's
    /// `start` up to the next page's `start`, or the end of `links` for
    /// the last page.
    pub fn link_range(&self, page_idx: u32) -> Range<u32> {
        let start_idx = self.pages[page_idx as usize].start;
        let end_idx = match self.pages.get(page_idx as usize + 1) {
            Some(page) => page.start,
            None => self.links.len() as u32,
        };
        start_idx..end_idx
    }

    /// Index of the page's first link, or `None` if the page has no links.
    /// NOTE(review): named after redirect pages, which carry at most one
    /// link (see `check_consistency`), but works for any page.
    pub fn link_redirect(&self, page_idx: u32) -> Option<u32> {
        let range = self.link_range(page_idx);
        if range.is_empty() {
            None
        } else {
            Some(range.start)
        }
    }

    /// Iterate over `(link index, link)` pairs of one page.
    pub fn links(&self, page_idx: u32) -> impl Iterator<Item = (u32, &Link<L>)> {
        self.link_range(page_idx).map(|i| (i, self.link(i)))
    }

    /// Map every page payload to a new type, keeping the link structure.
    pub fn change_page_data<P2>(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList<P2, L> {
        let pages = self
            .pages
            .into_iter()
            .map(|p| p.change_data(page_f))
            .collect::<Vec<_>>();

        AdjacencyList {
            pages,
            links: self.links,
        }
    }

    /// Map every link payload to a new type, keeping the page structure.
    pub fn change_link_data<L2>(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList<P, L2> {
        let links = self
            .links
            .into_iter()
            .map(|l| l.change_data(link_f))
            .collect::<Vec<_>>();

        AdjacencyList {
            pages: self.pages,
            links,
        }
    }
}
|
||||
|
||||
impl AdjacencyList<PageInfo, LinkInfo> {
    /// Validate the structural invariants of the data, panicking on the
    /// first violation.
    pub fn check_consistency(&self) {
        // Check that all types are large enough.
        // NOTE(review): the title limit here (u8::MAX) is stricter than the
        // u16 length prefix the store format allows — confirm intentional.
        assert!(self.pages.len() < u32::MAX as usize, "too many pages");
        assert!(self.links.len() < u32::MAX as usize, "too many links");
        for page in &self.pages {
            assert!(
                page.data.title.len() <= u8::MAX as usize,
                "page title too long"
            );
        }

        // Check that all links contain valid indices. Links must not link to
        // the sentinel page.
        let range = 0..self.pages.len() as u32;
        for link in &self.links {
            assert!(range.contains(&link.to), "invalid link");
        }

        // Check that all redirect pages have at most one link
        for (page_idx, page) in self.pages.iter().enumerate() {
            if page.data.redirect {
                let range = self.link_range(page_idx as u32);
                let amount = range.end - range.start;
                assert!(amount <= 1, "too many redirect links");
            }
        }
    }
}
|
||||
24
brood/src/data/info.rs
Normal file
24
brood/src/data/info.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
/// Per-page metadata carried as the page payload of the adjacency list.
#[derive(Debug, Clone)]
pub struct PageInfo {
    // Page identifier from the source data (serialized by the store;
    // distinct from the page's adjacency-list index).
    pub id: u32,
    pub title: String,
    // NOTE(review): presumably the article length in bytes — confirm.
    pub length: u32,
    // Whether this page is a redirect; redirect pages carry at most one
    // link (enforced by AdjacencyList::check_consistency).
    pub redirect: bool,
}
|
||||
|
||||
/// Per-link metadata carried as the link payload of the adjacency list.
#[derive(Debug, Default, Clone, Copy)]
pub struct LinkInfo {
    // NOTE(review): presumably the link's byte offset in the article text,
    // with `len` its byte length — confirm.
    pub start: u32,
    pub len: u32,
    /// Bit set of the `FLAG_*` constants below.
    pub flags: u8,
}

impl LinkInfo {
    /// Flag bit: the link occurs inside parentheses.
    pub const FLAG_IN_PARENS: u8 = 0b1;
    /// Flag bit: the link occurs inside a structure.
    pub const FLAG_IN_STRUCTURE: u8 = 0b10;

    pub fn in_parens(self) -> bool {
        self.flags & Self::FLAG_IN_PARENS != 0
    }

    pub fn in_structure(self) -> bool {
        self.flags & Self::FLAG_IN_STRUCTURE != 0
    }
}
|
||||
134
brood/src/data/store.rs
Normal file
134
brood/src/data/store.rs
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
use std::io::{self, Read, Write};
|
||||
|
||||
use super::{
|
||||
adjacency_list::{AdjacencyList, Link, Page},
|
||||
info::{LinkInfo, PageInfo},
|
||||
};
|
||||
|
||||
/// Write a `u8` in little-endian byte order.
fn write_u8(n: u8, to: &mut impl Write) -> io::Result<()> {
    to.write_all(&n.to_le_bytes())
}

/// Read a little-endian `u8`.
fn read_u8(from: &mut impl Read) -> io::Result<u8> {
    let mut bytes = [0_u8; 1];
    from.read_exact(&mut bytes)?;
    Ok(u8::from_le_bytes(bytes))
}

/// Write a `u16` in little-endian byte order.
fn write_u16(n: u16, to: &mut impl Write) -> io::Result<()> {
    to.write_all(&n.to_le_bytes())
}

/// Read a little-endian `u16`.
fn read_u16(from: &mut impl Read) -> io::Result<u16> {
    let mut bytes = [0_u8; 2];
    from.read_exact(&mut bytes)?;
    Ok(u16::from_le_bytes(bytes))
}

/// Write a `u32` in little-endian byte order.
fn write_u32(n: u32, to: &mut impl Write) -> io::Result<()> {
    to.write_all(&n.to_le_bytes())
}

/// Read a little-endian `u32`.
fn read_u32(from: &mut impl Read) -> io::Result<u32> {
    let mut bytes = [0_u8; 4];
    from.read_exact(&mut bytes)?;
    Ok(u32::from_le_bytes(bytes))
}

/// Write a string as a little-endian `u16` length prefix followed by its
/// UTF-8 bytes. Panics if the string is longer than `u16::MAX` bytes.
fn write_str(s: &str, to: &mut impl Write) -> io::Result<()> {
    assert!(s.len() <= u16::MAX as usize);
    write_u16(s.len() as u16, to)?;
    to.write_all(s.as_bytes())?;
    Ok(())
}

/// Read a length-prefixed UTF-8 string (see `write_str`). Panics if the
/// bytes are not valid UTF-8.
fn read_str(from: &mut impl Read) -> io::Result<String> {
    let len = usize::from(read_u16(from)?);
    let mut bytes = vec![0_u8; len];
    from.read_exact(&mut bytes)?;
    Ok(String::from_utf8(bytes).unwrap())
}
|
||||
|
||||
fn write_page<W: Write>(page: &Page<PageInfo>, to: &mut W) -> io::Result<()> {
|
||||
write_u32(page.start, to)?;
|
||||
write_u32(page.data.id, to)?;
|
||||
write_u32(page.data.length, to)?;
|
||||
write_u8(if page.data.redirect { 1 } else { 0 }, to)?;
|
||||
write_str(&page.data.title, to)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read_page<R: Read>(from: &mut R) -> io::Result<Page<PageInfo>> {
|
||||
let start_link_idx = read_u32(from)?;
|
||||
let id = read_u32(from)?;
|
||||
let length = read_u32(from)?;
|
||||
let redirect = read_u8(from)? != 0;
|
||||
let title = read_str(from)?;
|
||||
|
||||
Ok(Page {
|
||||
start: start_link_idx,
|
||||
data: PageInfo {
|
||||
id,
|
||||
length,
|
||||
redirect,
|
||||
title,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
fn write_link<W: Write>(link: &Link<LinkInfo>, to: &mut W) -> io::Result<()> {
|
||||
write_u32(link.to, to)?;
|
||||
write_u32(link.data.start, to)?;
|
||||
write_u32(link.data.len, to)?;
|
||||
write_u8(link.data.flags, to)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_link<R: Read>(from: &mut R) -> io::Result<Link<LinkInfo>> {
|
||||
let to_page_idx = read_u32(from)?;
|
||||
let start = read_u32(from)?;
|
||||
let len = read_u32(from)?;
|
||||
let flags = read_u8(from)?;
|
||||
|
||||
Ok(Link {
|
||||
to: to_page_idx,
|
||||
data: LinkInfo { start, len, flags },
|
||||
})
|
||||
}
|
||||
|
||||
pub fn write_adjacency_list<W: Write>(
|
||||
al: &AdjacencyList<PageInfo, LinkInfo>,
|
||||
to: &mut W,
|
||||
) -> io::Result<()> {
|
||||
write_u32(al.pages.len() as u32, to)?;
|
||||
write_u32(al.links.len() as u32, to)?;
|
||||
|
||||
for page in &al.pages {
|
||||
write_page(page, to)?;
|
||||
}
|
||||
|
||||
for link in &al.links {
|
||||
write_link(link, to)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read_adjacency_list<R: Read>(from: &mut R) -> io::Result<AdjacencyList<PageInfo, LinkInfo>> {
|
||||
let n_pages = read_u32(from)?;
|
||||
let n_links = read_u32(from)?;
|
||||
|
||||
let mut pages = vec![];
|
||||
for _ in 0..n_pages {
|
||||
pages.push(read_page(from)?);
|
||||
}
|
||||
|
||||
let mut links = vec![];
|
||||
for _ in 0..n_links {
|
||||
links.push(read_link(from)?);
|
||||
}
|
||||
|
||||
Ok(AdjacencyList { pages, links })
|
||||
}
|
||||
|
|
@ -1,295 +0,0 @@
|
|||
use std::ops::{Add, AddAssign, Range, Sub, SubAssign};
|
||||
|
||||
/// Index of a node in a `Graph`, stored compactly as a `u32` newtype.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct NodeIdx(pub u32);
|
||||
|
||||
impl NodeIdx {
|
||||
pub const NONE: Self = Self(u32::MAX);
|
||||
|
||||
#[inline]
|
||||
pub const fn new(value: usize) -> Self {
|
||||
Self(value as u32)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub const fn usize(self) -> usize {
|
||||
self.0 as usize
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u32> for NodeIdx {
|
||||
fn from(value: u32) -> Self {
|
||||
Self(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<usize> for NodeIdx {
|
||||
fn from(value: usize) -> Self {
|
||||
Self::new(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl Add for NodeIdx {
|
||||
type Output = Self;
|
||||
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Self(self.0 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl AddAssign for NodeIdx {
|
||||
fn add_assign(&mut self, rhs: Self) {
|
||||
self.0 += rhs.0;
|
||||
}
|
||||
}
|
||||
|
||||
impl Sub for NodeIdx {
|
||||
type Output = Self;
|
||||
|
||||
fn sub(self, rhs: Self) -> Self::Output {
|
||||
Self(self.0 - rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl SubAssign for NodeIdx {
|
||||
fn sub_assign(&mut self, rhs: Self) {
|
||||
self.0 -= rhs.0;
|
||||
}
|
||||
}
|
||||
|
||||
impl Add<u32> for NodeIdx {
|
||||
type Output = Self;
|
||||
|
||||
fn add(self, rhs: u32) -> Self::Output {
|
||||
Self(self.0 + rhs)
|
||||
}
|
||||
}
|
||||
|
||||
impl AddAssign<u32> for NodeIdx {
|
||||
fn add_assign(&mut self, rhs: u32) {
|
||||
self.0 += rhs;
|
||||
}
|
||||
}
|
||||
|
||||
impl Sub<u32> for NodeIdx {
|
||||
type Output = Self;
|
||||
|
||||
fn sub(self, rhs: u32) -> Self::Output {
|
||||
Self(self.0 - rhs)
|
||||
}
|
||||
}
|
||||
|
||||
impl SubAssign<u32> for NodeIdx {
|
||||
fn sub_assign(&mut self, rhs: u32) {
|
||||
self.0 -= rhs;
|
||||
}
|
||||
}
|
||||
|
||||
/// Index of an edge in a `Graph`, stored compactly as a `u32` newtype.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct EdgeIdx(pub u32);
|
||||
|
||||
impl EdgeIdx {
|
||||
#[inline]
|
||||
pub const fn new(value: usize) -> Self {
|
||||
Self(value as u32)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub const fn usize(self) -> usize {
|
||||
self.0 as usize
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u32> for EdgeIdx {
|
||||
fn from(value: u32) -> Self {
|
||||
Self(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<usize> for EdgeIdx {
|
||||
fn from(value: usize) -> Self {
|
||||
Self::new(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl Add for EdgeIdx {
|
||||
type Output = Self;
|
||||
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Self(self.0 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl AddAssign for EdgeIdx {
|
||||
fn add_assign(&mut self, rhs: Self) {
|
||||
self.0 += rhs.0;
|
||||
}
|
||||
}
|
||||
|
||||
impl Sub for EdgeIdx {
|
||||
type Output = Self;
|
||||
|
||||
fn sub(self, rhs: Self) -> Self::Output {
|
||||
Self(self.0 - rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl SubAssign for EdgeIdx {
|
||||
fn sub_assign(&mut self, rhs: Self) {
|
||||
self.0 -= rhs.0;
|
||||
}
|
||||
}
|
||||
|
||||
impl Add<u32> for EdgeIdx {
|
||||
type Output = Self;
|
||||
|
||||
fn add(self, rhs: u32) -> Self::Output {
|
||||
Self(self.0 + rhs)
|
||||
}
|
||||
}
|
||||
|
||||
impl AddAssign<u32> for EdgeIdx {
|
||||
fn add_assign(&mut self, rhs: u32) {
|
||||
self.0 += rhs;
|
||||
}
|
||||
}
|
||||
|
||||
impl Sub<u32> for EdgeIdx {
|
||||
type Output = Self;
|
||||
|
||||
fn sub(self, rhs: u32) -> Self::Output {
|
||||
Self(self.0 - rhs)
|
||||
}
|
||||
}
|
||||
|
||||
impl SubAssign<u32> for EdgeIdx {
|
||||
fn sub_assign(&mut self, rhs: u32) {
|
||||
self.0 -= rhs;
|
||||
}
|
||||
}
|
||||
|
||||
/// A directed graph in compressed (CSR-style) form: an offset array plus a
/// flat target array.
#[derive(Default)]
pub struct Graph {
    /// A node points to the first of its edges.
    ///
    /// A special case is that if the subsequent node points to the same edge,
    /// the current node has no edges. The edges of node `i` are therefore
    /// `edges[nodes[i]..nodes[i + 1]]` (with `edges.len()` as the implicit
    /// final bound).
    pub nodes: Vec<EdgeIdx>,

    /// An edge points to a target node.
    ///
    /// The source node is defined implicitly by the graph data structure:
    /// it is the node whose offset range contains this edge's index.
    pub edges: Vec<NodeIdx>,
}
|
||||
|
||||
impl Graph {
    /// Create an empty graph with room preallocated for `nodes` nodes and
    /// `edges` edges.
    pub fn with_capacity(nodes: usize, edges: usize) -> Self {
        Self {
            nodes: Vec::with_capacity(nodes),
            edges: Vec::with_capacity(edges),
        }
    }

    /// Append a new node whose edge list begins at the current end of
    /// `edges`; edges added afterwards (until the next `add_node`) belong
    /// to it.
    pub fn add_node(&mut self) {
        self.nodes.push(EdgeIdx::new(self.edges.len()));
    }

    /// Append an edge from the most recently added node to `target`.
    pub fn add_edge(&mut self, target: NodeIdx) {
        self.edges.push(target);
    }

    /// Assert all structural invariants of the compressed representation;
    /// panics with a descriptive message on the first violation.
    pub fn check_consistency(&self) {
        // An empty graph is valid only if it has no edges either.
        if self.nodes.is_empty() {
            assert!(self.edges.is_empty(), "edges must belong to existing nodes");
            return;
        }

        // u32::MAX itself is excluded: NodeIdx::NONE reserves it.
        assert!(self.nodes.len() < u32::MAX as usize, "too many nodes");
        assert!(self.edges.len() < u32::MAX as usize, "too many edges");

        assert_eq!(
            *self.nodes.first().unwrap(),
            EdgeIdx(0),
            "first node pointer must be 0"
        );

        // Offsets must be in-range and non-decreasing.
        for (ni, node) in self.nodes.iter().cloned().enumerate() {
            assert!(
                node.usize() <= self.edges.len(),
                "node pointers must be in range"
            );

            if let Some(succ) = self.nodes.get(ni + 1) {
                assert!(node <= *succ, "node pointers must be well-ordered");
            }
        }

        // Every edge must target an existing node.
        for edge in &self.edges {
            assert!(
                edge.usize() < self.nodes.len(),
                "edge pointers must be in range"
            );
        }
    }

    /// Iterate over all node indices, in order.
    pub fn nodes(&self) -> impl Iterator<Item = NodeIdx> + '_ {
        (0..self.nodes.len()).map(NodeIdx::new)
    }

    /// Iterate over all edges as `(source, target)` pairs, in edge order.
    pub fn edges(&self) -> impl Iterator<Item = (NodeIdx, NodeIdx)> + '_ {
        Edges::new(self)
    }

    /// Index of the first edge of `node`. For any out-of-range index (in
    /// particular "one past the last node") this returns `edges.len()`, so
    /// it can serve as an exclusive upper bound.
    pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx {
        self.nodes
            .get(node.usize())
            .copied()
            .unwrap_or_else(|| self.edges.len().into())
    }

    /// Half-open range of edge indices belonging to `node`.
    ///
    /// Panics if `node` is out of range.
    pub fn edge_range(&self, node: NodeIdx) -> Range<usize> {
        let start = self.nodes[node.usize()];
        let end = self.edge_start(node + 1);
        start.usize()..end.usize()
    }

    /// Slice of edge targets belonging to `node`.
    ///
    /// Panics if `node` is out of range.
    pub fn edge_slice(&self, node: NodeIdx) -> &[NodeIdx] {
        &self.edges[self.edge_range(node)]
    }
}
|
||||
|
||||
/// Iterator state for `Graph::edges`: walks every edge in order, tracking
/// which node currently owns it.
struct Edges<'a> {
    graph: &'a Graph,
    // Current candidate source node.
    ni: NodeIdx,
    // Next edge index to yield.
    ei: EdgeIdx,
}
|
||||
|
||||
impl<'a> Edges<'a> {
|
||||
fn new(graph: &'a Graph) -> Self {
|
||||
Self {
|
||||
graph,
|
||||
ni: NodeIdx(0),
|
||||
ei: EdgeIdx(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for Edges<'_> {
    type Item = (NodeIdx, NodeIdx);

    /// Yield the next `(source, target)` pair, advancing the source node
    /// whenever the current edge index moves past its edge range.
    fn next(&mut self) -> Option<Self::Item> {
        if self.ei.usize() >= self.graph.edges.len() {
            return None;
        }
        let target = self.graph.edges[self.ei.usize()];

        // Advance `ni` until the current edge falls inside its range. A plain
        // `if` would not be sufficient because some nodes may not have any
        // edges, so several consecutive nodes can share a start pointer.
        while self.ei >= self.graph.edge_start(self.ni + 1) {
            self.ni += 1;
        }
        let source = self.ni;

        self.ei += 1;
        Some((source, target))
    }
}
|
||||
|
|
@ -1,23 +1,51 @@
|
|||
mod algo;
|
||||
mod commands;
|
||||
pub mod commands;
|
||||
mod data;
|
||||
mod graph;
|
||||
mod util;
|
||||
|
||||
use std::{io, path::PathBuf};
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use clap::Parser;
|
||||
use data::Data;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Parser)]
|
||||
pub enum PhilosophyGameCmd {
|
||||
First,
|
||||
Canonical,
|
||||
Cluster,
|
||||
Trace { start: String },
|
||||
}
|
||||
|
||||
#[derive(Debug, Parser)]
|
||||
enum Command {
|
||||
Ingest(commands::ingest::Cmd),
|
||||
Export(commands::export::Cmd),
|
||||
Show(commands::show::Cmd),
|
||||
Stats(commands::stats::Cmd),
|
||||
Path(commands::path::Cmd),
|
||||
LongestPath(commands::longest_path::Cmd),
|
||||
Pg(commands::pg::Cmd),
|
||||
/// Read sift data on stdin and output brood data.
|
||||
Ingest,
|
||||
/// Read and reexport brood data.
|
||||
Reexport {
|
||||
to: PathBuf,
|
||||
#[arg(long, short = 'P')]
|
||||
in_parens: Option<bool>,
|
||||
#[arg(long, short = 'S')]
|
||||
in_structure: Option<bool>,
|
||||
#[arg(long, short = 'F')]
|
||||
filter: Option<PathBuf>,
|
||||
},
|
||||
/// Find a path from one article to another.
|
||||
Path {
|
||||
from: String,
|
||||
to: String,
|
||||
/// Flip start and end article.
|
||||
#[arg(short, long)]
|
||||
flip: bool,
|
||||
},
|
||||
/// Find the longest shortest path starting at an article.
|
||||
LongestShortestPath { from: String },
|
||||
/// Analyze articles using "Philosophy Game" rules.
|
||||
PhilosophyGame {
|
||||
#[command(subcommand)]
|
||||
subcmd: PhilosophyGameCmd,
|
||||
},
|
||||
/// Print all page titles.
|
||||
ListPages,
|
||||
}
|
||||
|
||||
#[derive(Debug, Parser)]
|
||||
|
|
@ -25,59 +53,31 @@ struct Args {
|
|||
datafile: PathBuf,
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
#[arg(long, short = 'P')]
|
||||
in_parens: Option<bool>,
|
||||
#[arg(long, short = 'S')]
|
||||
in_structure: Option<bool>,
|
||||
#[arg(long, short = 'R')]
|
||||
resolve_redirects: bool,
|
||||
#[arg(long, short = 'I')]
|
||||
invert_edges: bool,
|
||||
#[arg(long, short)]
|
||||
check_consistency: bool,
|
||||
}
|
||||
|
||||
fn main() -> io::Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
if let Command::Ingest(cmd) = &args.command {
|
||||
return cmd.run(&args.datafile);
|
||||
}
|
||||
|
||||
println!(">> Import");
|
||||
println!("> Reading data");
|
||||
let mut data = Data::read_from_file(&args.datafile)?;
|
||||
|
||||
if args.in_parens.is_some() || args.in_structure.is_some() {
|
||||
println!("> Filtering edges");
|
||||
algo::retain_edges(&mut data, |link| {
|
||||
args.in_parens.is_none_or(|b| b == link.in_parens())
|
||||
&& args.in_structure.is_none_or(|b| b == link.in_structure())
|
||||
});
|
||||
}
|
||||
|
||||
if args.resolve_redirects {
|
||||
println!("> Resolving redirects");
|
||||
algo::resolve_redirects(&mut data);
|
||||
}
|
||||
|
||||
if args.invert_edges {
|
||||
println!("> Inverting edges");
|
||||
algo::invert(&mut data);
|
||||
}
|
||||
|
||||
if args.check_consistency {
|
||||
println!("> Checking consistencey");
|
||||
data.check_consistency();
|
||||
}
|
||||
|
||||
match args.command {
|
||||
Command::Ingest(_) => unreachable!(),
|
||||
Command::Export(cmd) => cmd.run(data),
|
||||
Command::Show(cmd) => cmd.run(data),
|
||||
Command::Stats(cmd) => cmd.run(data),
|
||||
Command::Path(cmd) => cmd.run(data),
|
||||
Command::LongestPath(cmd) => cmd.run(data),
|
||||
Command::Pg(cmd) => cmd.run(data),
|
||||
Command::Ingest => commands::ingest::ingest(&args.datafile),
|
||||
Command::Reexport {
|
||||
to,
|
||||
in_parens,
|
||||
in_structure,
|
||||
filter,
|
||||
} => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure, filter),
|
||||
Command::Path { from, to, flip } => {
|
||||
if flip {
|
||||
commands::path::path(&args.datafile, &to, &from)
|
||||
} else {
|
||||
commands::path::path(&args.datafile, &from, &to)
|
||||
}
|
||||
}
|
||||
Command::LongestShortestPath { from } => {
|
||||
commands::longest_shortest_path::run(&args.datafile, &from)
|
||||
}
|
||||
Command::PhilosophyGame { subcmd } => {
|
||||
commands::philosophy_game::run(&args.datafile, subcmd)
|
||||
}
|
||||
Command::ListPages => commands::list_pages::run(&args.datafile),
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,160 +1,39 @@
|
|||
use std::{collections::HashSet, fmt};
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
use crate::{
|
||||
data::{Data, Page},
|
||||
graph::NodeIdx,
|
||||
use crate::data::{
|
||||
adjacency_list::{AdjacencyList, Page},
|
||||
info::{LinkInfo, PageInfo},
|
||||
};
|
||||
|
||||
// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/mediawiki.Title.phpCharToUpper.js
|
||||
struct PhpCharToUpper(char);
|
||||
pub fn normalize_link(link: &str) -> String {
|
||||
let link = link.trim().replace(' ', "_");
|
||||
|
||||
impl fmt::Display for PhpCharToUpper {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self.0 {
|
||||
// Do something special, I guess
|
||||
'ᾀ' => write!(f, "ᾈ"),
|
||||
'ᾁ' => write!(f, "ᾉ"),
|
||||
'ᾂ' => write!(f, "ᾊ"),
|
||||
'ᾃ' => write!(f, "ᾋ"),
|
||||
'ᾄ' => write!(f, "ᾌ"),
|
||||
'ᾅ' => write!(f, "ᾍ"),
|
||||
'ᾆ' => write!(f, "ᾎ"),
|
||||
'ᾇ' => write!(f, "ᾏ"),
|
||||
'ᾐ' => write!(f, "ᾘ"),
|
||||
'ᾑ' => write!(f, "ᾙ"),
|
||||
'ᾒ' => write!(f, "ᾚ"),
|
||||
'ᾓ' => write!(f, "ᾛ"),
|
||||
'ᾔ' => write!(f, "ᾜ"),
|
||||
'ᾕ' => write!(f, "ᾝ"),
|
||||
'ᾖ' => write!(f, "ᾞ"),
|
||||
'ᾗ' => write!(f, "ᾟ"),
|
||||
'ᾠ' => write!(f, "ᾨ"),
|
||||
'ᾡ' => write!(f, "ᾩ"),
|
||||
'ᾢ' => write!(f, "ᾪ"),
|
||||
'ᾣ' => write!(f, "ᾫ"),
|
||||
'ᾤ' => write!(f, "ᾬ"),
|
||||
'ᾥ' => write!(f, "ᾭ"),
|
||||
'ᾦ' => write!(f, "ᾮ"),
|
||||
'ᾧ' => write!(f, "ᾯ"),
|
||||
'ᾳ' => write!(f, "ᾼ"),
|
||||
'ῃ' => write!(f, "ῌ"),
|
||||
'ῳ' => write!(f, "ῼ"),
|
||||
|
||||
// Do not capitalize
|
||||
'ß' | 'ʼn' | 'ǰ' | 'ʂ' | 'ͅ' | 'ΐ' | 'ΰ' | 'և' | 'ა' | 'ბ' | 'გ' | 'დ' | 'ე' | 'ვ'
|
||||
| 'ზ' | 'თ' | 'ი' | 'კ' | 'ლ' | 'მ' | 'ნ' | 'ო' | 'პ' | 'ჟ' | 'რ' | 'ს' | 'ტ' | 'უ'
|
||||
| 'ფ' | 'ქ' | 'ღ' | 'ყ' | 'შ' | 'ჩ' | 'ც' | 'ძ' | 'წ' | 'ჭ' | 'ხ' | 'ჯ' | 'ჰ' | 'ჱ'
|
||||
| 'ჲ' | 'ჳ' | 'ჴ' | 'ჵ' | 'ჶ' | 'ჷ' | 'ჸ' | 'ჹ' | 'ჺ' | 'ჽ' | 'ჾ' | 'ჿ' | 'ᶎ' | 'ẖ'
|
||||
| 'ẗ' | 'ẘ' | 'ẙ' | 'ẚ' | 'ὐ' | 'ὒ' | 'ὔ' | 'ὖ' | 'ᾈ' | 'ᾉ' | 'ᾊ' | 'ᾋ' | 'ᾌ' | 'ᾍ'
|
||||
| 'ᾎ' | 'ᾏ' | 'ᾘ' | 'ᾙ' | 'ᾚ' | 'ᾛ' | 'ᾜ' | 'ᾝ' | 'ᾞ' | 'ᾟ' | 'ᾨ' | 'ᾩ' | 'ᾪ' | 'ᾫ'
|
||||
| 'ᾬ' | 'ᾭ' | 'ᾮ' | 'ᾯ' | 'ᾲ' | 'ᾴ' | 'ᾶ' | 'ᾷ' | 'ᾼ' | 'ῂ' | 'ῄ' | 'ῆ' | 'ῇ' | 'ῌ'
|
||||
| 'ῒ' | 'ΐ' | 'ῖ' | 'ῗ' | 'ῢ' | 'ΰ' | 'ῤ' | 'ῦ' | 'ῧ' | 'ῲ' | 'ῴ' | 'ῶ' | 'ῷ' | 'ῼ'
|
||||
| 'ⅰ' | 'ⅱ' | 'ⅲ' | 'ⅳ' | 'ⅴ' | 'ⅵ' | 'ⅶ' | 'ⅷ' | 'ⅸ' | 'ⅹ' | 'ⅺ' | 'ⅻ' | 'ⅼ' | 'ⅽ'
|
||||
| 'ⅾ' | 'ⅿ' | 'ⓐ' | 'ⓑ' | 'ⓒ' | 'ⓓ' | 'ⓔ' | 'ⓕ' | 'ⓖ' | 'ⓗ' | 'ⓘ' | 'ⓙ' | 'ⓚ' | 'ⓛ'
|
||||
| 'ⓜ' | 'ⓝ' | 'ⓞ' | 'ⓟ' | 'ⓠ' | 'ⓡ' | 'ⓢ' | 'ⓣ' | 'ⓤ' | 'ⓥ' | 'ⓦ' | 'ⓧ' | 'ⓨ' | 'ⓩ'
|
||||
| 'ꞔ' | 'ꞹ' | 'ꞻ' | 'ꞽ' | 'ꞿ' | 'ꟃ' | 'ff' | 'fi' | 'fl' | 'ffi' | 'ffl' | 'ſt' | 'st' | 'ﬓ'
|
||||
| 'ﬔ' | 'ﬕ' | 'ﬖ' | 'ﬗ' | '𖹠' | '𖹡' | '𖹢' | '𖹣' | '𖹤' | '𖹥' | '𖹦' | '𖹧' | '𖹨' | '𖹩'
|
||||
| '𖹪' | '𖹫' | '𖹬' | '𖹭' | '𖹮' | '𖹯' | '𖹰' | '𖹱' | '𖹲' | '𖹳' | '𖹴' | '𖹵' | '𖹶' | '𖹷'
|
||||
| '𖹸' | '𖹹' | '𖹺' | '𖹻' | '𖹼' | '𖹽' | '𖹾' | '𖹿' => {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
|
||||
// Capitalize normally
|
||||
c => write!(f, "{}", c.to_uppercase()),
|
||||
}
|
||||
}
|
||||
// Make only first char lowercase
|
||||
link.chars()
|
||||
.next()
|
||||
.iter()
|
||||
.flat_map(|c| c.to_lowercase())
|
||||
.chain(link.chars().skip(1))
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
pub struct TitleNormalizer {
|
||||
strip_bidi: Regex,
|
||||
clean_up_whitespace: Regex,
|
||||
trim_underscore_start: Regex,
|
||||
trim_underscore_end: Regex,
|
||||
}
|
||||
|
||||
impl TitleNormalizer {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
strip_bidi: Regex::new("[\u{200E}\u{200F}\u{202A}-\u{202E}]").unwrap(),
|
||||
|
||||
clean_up_whitespace: Regex::new(concat!(
|
||||
"[ _\u{00A0}\u{1680}\u{180E}\u{2000}-\u{200A}",
|
||||
"\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+"
|
||||
))
|
||||
.unwrap(),
|
||||
|
||||
trim_underscore_start: Regex::new("^_+").unwrap(),
|
||||
|
||||
trim_underscore_end: Regex::new("_+$").unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalize an article title.
|
||||
///
|
||||
/// See also <https://github.com/wikimedia/mediawiki-title>.
|
||||
pub fn normalize(&self, title: &str) -> String {
|
||||
// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L403
|
||||
|
||||
// Strip Unicode bidi override characters
|
||||
let title = self.strip_bidi.replace_all(title, "");
|
||||
|
||||
// Clean up whitespace
|
||||
let title = self.clean_up_whitespace.replace_all(&title, "_");
|
||||
|
||||
// Trim _ from beginning and end
|
||||
let title = self.trim_underscore_start.replace_all(&title, "");
|
||||
let title = self.trim_underscore_end.replace_all(&title, "");
|
||||
|
||||
// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L206
|
||||
let Some(first) = title.chars().next() else {
|
||||
return String::new();
|
||||
};
|
||||
let rest = &title[first.len_utf8()..];
|
||||
format!("{}{rest}", PhpCharToUpper(first))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx {
|
||||
let normalized = normalizer.normalize(title);
|
||||
data.pages
|
||||
pub fn find_index_of_title(pages: &[Page<PageInfo>], title: &str) -> u32 {
|
||||
let title = normalize_link(title);
|
||||
pages
|
||||
.iter()
|
||||
.enumerate()
|
||||
.find(|(_, p)| normalizer.normalize(&p.title) == normalized)
|
||||
.map(|(i, _)| NodeIdx::new(i))
|
||||
.expect("invalid title")
|
||||
.find(|(_, p)| normalize_link(&p.data.title) == title)
|
||||
.map(|(i, _)| i)
|
||||
.expect("invalid title") as u32
|
||||
}
|
||||
|
||||
pub fn resolve_redirects(data: &Data, node: NodeIdx) -> NodeIdx {
|
||||
let mut curr = node;
|
||||
let mut seen = HashSet::new();
|
||||
|
||||
seen.insert(curr);
|
||||
while let Some(target) = data.redirect_target(curr) {
|
||||
if seen.contains(&target) {
|
||||
println!(
|
||||
" Redirect cycle deteted: {:?}",
|
||||
data.pages[node.usize()].title
|
||||
);
|
||||
break;
|
||||
pub fn resolve_redirects(data: &AdjacencyList<PageInfo, LinkInfo>, mut page_idx: u32) -> u32 {
|
||||
loop {
|
||||
if data.page(page_idx).data.redirect {
|
||||
if let Some(link_idx) = data.link_redirect(page_idx) {
|
||||
page_idx = data.link(link_idx).to;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
seen.insert(target);
|
||||
curr = target;
|
||||
}
|
||||
|
||||
curr
|
||||
}
|
||||
|
||||
pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx {
|
||||
resolve_redirects(data, locate_title(normalizer, data, title))
|
||||
}
|
||||
|
||||
pub fn fmt_page(page: &Page) -> String {
|
||||
if page.redirect {
|
||||
format!("v {}", page.title)
|
||||
} else {
|
||||
format!("- {}", page.title)
|
||||
return page_idx;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
11
sift/sift.py
11
sift/sift.py
|
|
@ -172,21 +172,16 @@ def process_xmldump_page(page):
|
|||
# Page info as simple tuples
|
||||
def simple_pages(input):
|
||||
dump = mwxml.Dump.from_file(sys.stdin)
|
||||
articles = 0
|
||||
for i, page in enumerate(dump.pages):
|
||||
if (i + 1) % 1000 == 0:
|
||||
# Yeah, the articles are usually off by one
|
||||
eprint(f"{i+1:8} pages, {articles:8} articles, at pid {page.id:8}")
|
||||
|
||||
if page.namespace != 0:
|
||||
continue
|
||||
|
||||
articles += 1
|
||||
if (i + 1) % 1000 == 0:
|
||||
eprint(f"{i+1:8} pages, at pid {page.id:8}")
|
||||
|
||||
[revision] = list(page) # Every page has exactly one revision
|
||||
yield page.id, page.title, revision.text or "", page.redirect
|
||||
|
||||
eprint(f"{articles} articles total")
|
||||
|
||||
|
||||
def process_simple_page(info):
|
||||
pid, title, text, redirect = info
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue