Compare commits

...
Sign in to create a new pull request.

36 commits

Author SHA1 Message Date
ee66509dd8 Remove now-obsolete Counter 2025-01-03 02:43:40 +01:00
d9fd29c1c3 Add progress bars to ingest command 2025-01-01 15:59:39 +01:00
8016bbfc83 Port and rename pg command 2025-01-01 00:59:03 +01:00
b2a8597c6f Add longest-path command 2024-12-31 19:54:43 +01:00
3045d6d6c6 Improve link degree formatting 2024-12-31 19:05:59 +01:00
698b6590d1 Remove unused method 2024-12-31 18:27:18 +01:00
cdf9a7d7ae Compute article rankings by link degrees 2024-12-31 18:26:12 +01:00
5b8feb6368 Add general stats command 2024-12-31 17:40:20 +01:00
76efd6d728 Add redirects command 2024-12-31 17:14:21 +01:00
e90864a097 Add -I and -c cli options 2024-12-31 16:41:19 +01:00
04482f9f2f Detect redirect cycles 2024-12-31 16:40:55 +01:00
1f20e0519a Fix off-by-one in consistency check 2024-12-31 16:40:42 +01:00
6611dd3160 Add --bidi flag to path command 2024-12-31 15:51:58 +01:00
535d7ff236 Add export command 2024-12-31 15:38:30 +01:00
c573f1b0b0 Allow transforming graph before commands 2024-12-31 15:38:30 +01:00
ab7b7295ca Remove unused code 2024-12-31 15:38:30 +01:00
693ae9eb81 Show more info and optionally links 2024-12-31 15:38:30 +01:00
ceb987bbbc Add show command 2024-12-31 15:16:35 +01:00
aa4187fcd8 Group pages, links, and graph in Data struct 2024-12-31 13:07:26 +01:00
0168373509 Move dijkstra to new file 2024-12-31 13:07:26 +01:00
6ca20c9740 Remove some old code 2024-12-31 13:06:46 +01:00
4e41084f2a Port path command 2024-12-31 13:06:46 +01:00
abd6b3519c Get rid of rustc_hash 2024-12-31 13:06:46 +01:00
e04215802e Speed up ingest using rustc_hash
An enwiki ingest went from ca. 6:50 minutes down to ca. 7:00 minutes. Oh
wait...

This was not a rigorous test, but rustc_hash doesn't seem to have a
significant positive impact. Maybe I'm just holding it wrong, but right
now I'd rather remove it again and have simpler code/deps.
2024-12-31 13:06:46 +01:00
eb631250d7 Fix ingest logic yet again 2024-12-31 13:06:46 +01:00
a3d0136ad2 Fix ingest logic and panics 2024-12-31 13:06:46 +01:00
3aa8222b6b Rewrite ingest command 2024-12-31 13:06:46 +01:00
f819f5bf69 Remove adjlist-based path implementation 2024-12-31 13:06:46 +01:00
18e54c4ce1 Implement new graph data structure and dijkstra 2024-12-31 00:47:14 +01:00
778cb6748d Remove petgraph 2024-12-30 16:00:09 +01:00
34df6c9f14 Try out petgraph 2024-12-30 13:12:49 +01:00
e3e191b748 Improve philosophy game trace output 2024-12-30 13:12:25 +01:00
49665f74ce List links to and from an article 2024-12-30 13:12:14 +01:00
f5f4f99a2f Update dependencies 2024-12-29 23:28:55 +01:00
c2c1b1234c Change link data with page info 2024-12-29 23:22:02 +01:00
7af2a4e06f Print nicer sift stats 2024-12-29 20:48:52 +01:00
27 changed files with 2229 additions and 1274 deletions

335
brood/Cargo.lock generated
View file

@ -1,12 +1,21 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "0.6.14"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
"anstyle",
"anstyle-parse",
@ -19,33 +28,33 @@ dependencies = [
[[package]]
name = "anstyle"
version = "1.0.7"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.4"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.0"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.3"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
"anstyle",
"windows-sys",
@ -56,16 +65,30 @@ name = "brood"
version = "0.0.0"
dependencies = [
"clap",
"rustc-hash",
"indicatif",
"regex",
"serde",
"serde_json",
"thousands",
]
[[package]]
name = "clap"
version = "4.5.7"
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
dependencies = [
"clap_builder",
"clap_derive",
@ -73,9 +96,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.5.7"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f"
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
dependencies = [
"anstream",
"anstyle",
@ -85,9 +108,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "4.5.5"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"proc-macro2",
@ -97,15 +120,34 @@ dependencies = [
[[package]]
name = "clap_lex"
version = "0.7.1"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "colorchoice"
version = "1.0.1"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "console"
version = "0.15.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
dependencies = [
"encode_unicode",
"libc",
"once_cell",
"unicode-width",
"windows-sys",
]
[[package]]
name = "encode_unicode"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "heck"
@ -114,40 +156,122 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "is_terminal_polyfill"
version = "1.70.0"
name = "indicatif"
version = "0.17.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
dependencies = [
"console",
"number_prefix",
"portable-atomic",
"unicode-width",
"web-time",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itoa"
version = "1.0.11"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
[[package]]
name = "js-sys"
version = "0.3.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "portable-atomic"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
[[package]]
name = "proc-macro2"
version = "1.0.86"
version = "1.0.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.36"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-hash"
version = "2.0.0"
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "ryu"
@ -157,18 +281,18 @@ checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]]
name = "serde"
version = "1.0.203"
version = "1.0.217"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.203"
version = "1.0.217"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
dependencies = [
"proc-macro2",
"quote",
@ -177,11 +301,12 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.118"
version = "1.0.134"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4"
checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
@ -194,9 +319,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
version = "2.0.68"
version = "2.0.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9"
checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058"
dependencies = [
"proc-macro2",
"quote",
@ -204,10 +329,22 @@ dependencies = [
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
name = "thousands"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820"
[[package]]
name = "unicode-ident"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
name = "unicode-width"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
[[package]]
name = "utf8parse"
@ -216,19 +353,83 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "windows-sys"
version = "0.52.0"
name = "wasm-bindgen"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
dependencies = [
"cfg-if",
"once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
@ -242,48 +443,48 @@ dependencies = [
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.5"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View file

@ -4,7 +4,9 @@ version = "0.0.0"
edition = "2021"
[dependencies]
clap = { version = "4.5.7", features = ["derive", "deprecated"] }
rustc-hash = "2.0.0"
serde = { version = "1.0.203", features = ["derive"] }
serde_json = "1.0.118"
clap = { version = "4.5.23", features = ["derive", "deprecated"] }
indicatif = "0.17.9"
regex = "1.11.1"
serde = { version = "1.0.217", features = ["derive"] }
serde_json = "1.0.134"
thousands = "0.2.0"

4
brood/src/algo.rs Normal file
View file

@ -0,0 +1,4 @@
//! Graph algorithms operating on the brood graph: Dijkstra shortest paths
//! and in-place graph edits (see the `dijkstra` and `edit` submodules).
mod dijkstra;
mod edit;
// Flatten the submodules so callers can write `crate::algo::Dijkstra` etc.
pub use self::{dijkstra::*, edit::*};

View file

@ -0,0 +1,77 @@
use std::{cmp::Reverse, collections::BinaryHeap};
use crate::graph::{EdgeIdx, Graph, NodeIdx};
/// Single-source shortest-path state (Dijkstra's algorithm) over a [`Graph`].
///
/// Holds one cost and one predecessor entry per node. Unvisited nodes keep
/// `u32::MAX` cost and `NodeIdx::NONE` predecessor.
pub struct Dijkstra<'a> {
    graph: &'a Graph,
    // cost[n] = cheapest known cost from the start node to n (u32::MAX = unreached)
    cost: Vec<u32>,
    // pred[n] = previous node on the cheapest known path to n (NONE = unreached/start)
    pred: Vec<NodeIdx>,
}
impl<'a> Dijkstra<'a> {
    /// Creates fresh search state for `graph` with all nodes unreached.
    pub fn new(graph: &'a Graph) -> Self {
        Self {
            graph,
            cost: vec![u32::MAX; graph.nodes.len()],
            pred: vec![NodeIdx::NONE; graph.nodes.len()],
        }
    }
    /// Runs Dijkstra from `start`, stopping early once a node satisfying
    /// `goal` is popped. `cost(from, edge, to)` supplies the per-edge weight.
    ///
    /// NOTE(review): this assumes the state is fresh (as produced by [`new`]);
    /// calling `run` twice on the same instance would reuse stale costs —
    /// TODO confirm callers never do that.
    pub fn run(
        &mut self,
        start: NodeIdx,
        goal: impl Fn(NodeIdx) -> bool,
        cost: impl Fn(NodeIdx, EdgeIdx, NodeIdx) -> u32,
    ) {
        self.cost[start.usize()] = 0;
        // BinaryHeap is a max-heap; Reverse turns it into a min-heap on cost.
        let mut queue = BinaryHeap::new();
        queue.push((Reverse(0), start));
        while let Some((Reverse(curr_cost), curr)) = queue.pop() {
            if goal(curr) {
                break; // We've found the shortest path to our target
            }
            // These seem to never actually occur
            // if curr_cost > self.cost[curr.usize()] {
            //     continue; // Outdated entry
            // }
            // (Even without the skip above, the `next_cost < cost[next]` check
            // below keeps stale entries from corrupting results; they just
            // cost some extra relaxation work.)
            for edge in self.graph.edge_range(curr).map(EdgeIdx::new) {
                let next = self.graph.edges[edge.usize()];
                let next_cost = curr_cost + cost(curr, edge, next);
                if next_cost < self.cost[next.usize()] {
                    self.cost[next.usize()] = next_cost;
                    self.pred[next.usize()] = curr;
                    queue.push((Reverse(next_cost), next));
                }
            }
        }
    }
    /// Cheapest known cost from the start node to `node` (`u32::MAX` if unreached).
    #[inline]
    pub fn cost(&self, node: NodeIdx) -> u32 {
        self.cost[node.usize()]
    }
    /// Predecessor of `node` on its cheapest known path (`NONE` if unreached
    /// or if `node` is the start).
    #[inline]
    pub fn pred(&self, node: NodeIdx) -> NodeIdx {
        self.pred[node.usize()]
    }
    /// Reconstructs the path ending at `goal` by walking predecessors until a
    /// node with no predecessor, then reverses it into start-to-goal order.
    ///
    /// NOTE(review): if `goal` was never reached, this returns just `[goal]`
    /// (its pred is still NONE) — callers presumably check `cost(goal)` first.
    pub fn path(&self, goal: NodeIdx) -> Vec<NodeIdx> {
        let mut path = vec![];
        let mut at = goal;
        loop {
            path.push(at);
            at = self.pred(at);
            if at == NodeIdx::NONE {
                break;
            }
        }
        path.reverse();
        path
    }
}

97
brood/src/algo/edit.rs Normal file
View file

@ -0,0 +1,97 @@
use std::mem;
use crate::{
    data::{Data, Link},
    graph::NodeIdx,
    util,
};
/// Rebuilds `data`'s graph and link list in place, keeping only the edges
/// whose [`Link`] satisfies `f`. Nodes (and their order) are unchanged.
///
/// Relies on the invariant that `data.links` runs parallel to the graph's
/// edge list in iteration order (one link per edge).
pub fn retain_edges(data: &mut Data, f: impl Fn(&Link) -> bool) {
    // Take ownership of the old graph/links and rebuild into the now-empty
    // fields, avoiding a clone.
    let mut links = mem::take(&mut data.links).into_iter();
    let graph = mem::take(&mut data.graph);
    for node in graph.nodes() {
        data.graph.add_node();
        for edge in graph.edge_slice(node) {
            let link = links.next().unwrap();
            if f(&link) {
                data.links.push(link);
                data.graph.add_edge(*edge);
            }
        }
    }
}
/// Removes all redirect pages from `data`, rewriting every edge that pointed
/// at a redirect to point at its final (non-redirect) target instead. Edges
/// whose redirect chain does not end at a kept page are dropped.
pub fn resolve_redirects(data: &mut Data) {
    // Permutation from input node to input node
    let mut perm_redirect = vec![NodeIdx::NONE; data.pages.len()];
    for node in data.graph.nodes() {
        perm_redirect[node.usize()] = util::resolve_redirects(data, node);
    }
    // Permutation from input node to final node
    let mut perm_retain = vec![NodeIdx::NONE; data.pages.len()];
    let mut perm_retain_count = NodeIdx(0);
    for (i, page) in data.pages.iter().enumerate() {
        if !page.redirect {
            // Non-redirect pages keep their relative order and get compacted
            // indices 0, 1, 2, ...
            perm_retain[i] = perm_retain_count;
            perm_retain_count += 1;
        }
    }
    // Rebuild pages/links/graph in place; the iterators must stay in lockstep
    // with the graph traversal (one page per node, one link per edge).
    let mut pages = mem::take(&mut data.pages).into_iter();
    let mut links = mem::take(&mut data.links).into_iter();
    let graph = mem::take(&mut data.graph);
    for node in graph.nodes() {
        let page = pages.next().unwrap();
        let new_node = perm_retain[node.usize()];
        if new_node == NodeIdx::NONE {
            // Skip all edges
            for _ in graph.edge_slice(node) {
                links.next().unwrap();
            }
            continue;
        }
        data.pages.push(page);
        data.graph.add_node();
        for edge in graph.edge_slice(node) {
            let link = links.next().unwrap();
            // Follow the redirect chain first, then map to the compacted index.
            let new_edge = perm_retain[perm_redirect[edge.usize()].usize()];
            if new_edge == NodeIdx::NONE {
                // Target was itself dropped (e.g. unresolved redirect) — drop the edge.
                continue;
            }
            data.links.push(link);
            data.graph.add_edge(new_edge);
        }
    }
}
/// Reverses the direction of every edge in `data`'s graph in place, keeping
/// each edge paired with its original [`Link`].
pub fn invert(data: &mut Data) {
    let links = mem::take(&mut data.links);
    let graph = mem::take(&mut data.graph);
    // Materialize (source, target, link) triples so they can be regrouped by
    // target instead of by source.
    let mut edges = graph
        .edges()
        .zip(links)
        .map(|((source, target), link)| (source, target, link))
        .collect::<Vec<_>>();
    // Stable sort: edges sharing a target keep their original source order.
    edges.sort_by_key(|(_, target, _)| *target);
    let mut edges = edges.into_iter().peekable();
    for node in graph.nodes() {
        data.graph.add_node();
        // `<=` (rather than `==`) also drains any edges whose target sorts
        // before `node`; since nodes are visited in order this consumes
        // exactly the edges targeting `node`.
        while edges.peek().is_some_and(|(_, target, _)| *target <= node) {
            let (source, _, link) = edges.next().unwrap();
            data.graph.add_edge(source);
            data.links.push(link);
        }
    }
}

View file

@ -1,6 +1,7 @@
pub mod export;
pub mod ingest;
pub mod list_pages;
pub mod longest_shortest_path;
pub mod longest_path;
pub mod path;
pub mod philosophy_game;
pub mod reexport;
pub mod pg;
pub mod show;
pub mod stats;

View file

@ -0,0 +1,17 @@
use std::{io, path::PathBuf};
use crate::data::Data;
// Export command: serializes the in-memory data set to a file.
// NB: deliberately no `///` doc comments here — clap derive would turn them
// into `--help` text and change the CLI output.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
    // Destination path for the exported brood data file.
    out: PathBuf,
}
impl Cmd {
    // Announce the step, then persist `data` to the requested path. The
    // io::Result from `write_to_file` is propagated directly as our result.
    pub fn run(self, data: Data) -> io::Result<()> {
        println!(">> Export");
        data.write_to_file(&self.out)
    }
}

View file

@ -1,16 +1,33 @@
use std::collections::hash_map::Entry;
use std::fs::File;
use std::io::{self, BufRead, BufReader, BufWriter};
use std::path::Path;
use std::u32;
use std::{
collections::{hash_map::Entry, HashMap},
fs::File,
io::{self, BufRead, BufReader, Seek},
path::{Path, PathBuf},
};
use rustc_hash::FxHashMap;
use indicatif::{ProgressBar, ProgressStyle};
use serde::Deserialize;
use thousands::Separable;
use crate::data::adjacency_list::{AdjacencyList, Page};
use crate::data::info::{LinkInfo, PageInfo};
use crate::data::store;
use crate::util;
use crate::{
data::{Data, Link, Page},
graph::NodeIdx,
util::TitleNormalizer,
};
const PROGRESS_CHARS: &str = "█▉▊▋▌▍▎▏ ";
fn seek_to_start(f: &mut BufReader<File>) -> io::Result<u64> {
let size = f.seek(io::SeekFrom::End(0))?;
f.seek(io::SeekFrom::Start(0))?;
Ok(size)
}
fn file_progress_style() -> ProgressStyle {
ProgressStyle::with_template("{wide_bar} {bytes}/{total_bytes}")
.unwrap()
.progress_chars(PROGRESS_CHARS)
}
#[derive(Deserialize)]
struct JsonPage {
@ -21,151 +38,161 @@ struct JsonPage {
redirect: Option<String>,
}
/*
Importing is a tad complicated because of multiple criteria:
fn read_titles(f: &mut BufReader<File>) -> io::Result<Vec<String>> {
let size = seek_to_start(f)?;
let bar = ProgressBar::new(size).with_style(file_progress_style());
1. The data must be read in a single pass on stdin
2. The process should not consume a lot of memory
(can't store the decoded json data directly)
3. The process should result in a nice and compact adjacency list format
let mut titles = vec![];
Because of this, the import is a bit more complex and has two passes.
for line in bar.wrap_read(f).lines() {
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
titles.push(page.title);
}
The first pass imports the data into an adjacency-list-like format, but the
`Link::to` field points to a title in `Titles` instead of a page.
The second pass then resolves the links to page indices and throws away all
links that don't point to any known page.
*/
#[derive(Default)]
struct Titles {
/// Normalized titles
titles: Vec<String>,
/// Map from normalized title to index in [`Self::titles`].
map: FxHashMap<String, u32>,
Ok(titles)
}
impl Titles {
fn insert(&mut self, title: String) -> u32 {
match self.map.entry(title.clone()) {
Entry::Occupied(occupied) => *occupied.get(),
Entry::Vacant(vacant) => {
let idx = self.titles.len() as u32;
self.titles.push(title);
vacant.insert(idx);
idx
/// Returns a map from normalized title to the index in the brood data where the
/// article will appear.
///
/// Titles in the title list are not always unique. When multiple identical
/// titles appear, all but one have to be discarded. Originally, I tried to be
/// smart and keep the last occurrence (under the assumption that its data would
/// be the newest), but this led to index-based bugs. Because of this, I now
/// keep the first occurrence.
fn compute_title_lookup(
normalizer: &TitleNormalizer,
titles: &[String],
) -> HashMap<String, (u32, u32)> {
let mut title_lookup = HashMap::<String, (u32, u32)>::new();
let bar = ProgressBar::new(titles.len() as u64)
.with_style(ProgressStyle::default_bar().progress_chars(PROGRESS_CHARS));
for (sift_i, title) in bar.wrap_iter(titles.iter()).enumerate() {
// The index where this article will appear in the final list, assuming
// it is not a duplicate. For ownership reasons, we compute this here
// instead of inside the Entry::Vacant branch of the following match.
let brood_i = title_lookup.len();
match title_lookup.entry(normalizer.normalize(title)) {
Entry::Vacant(entry) => {
entry.insert((sift_i as u32, brood_i as u32));
}
Entry::Occupied(entry) => {
let prev_sift_i = entry.get().0;
let prev = &titles[prev_sift_i as usize];
if prev == title {
bar.println(format!(
" {title:?} ({prev_sift_i}) occurs again at {sift_i}"
));
} else {
bar.println(format!(
" {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) normalize to {:?}",
normalizer.normalize(title)
));
}
}
}
}
fn get(&self, i: u32) -> &str {
&self.titles[i as usize]
}
title_lookup
}
fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
let mut titles = Titles::default();
let mut result = AdjacencyList::default();
fn read_page_data(
normalizer: &TitleNormalizer,
title_lookup: &HashMap<String, (u32, u32)>,
f: &mut BufReader<File>,
) -> io::Result<Data> {
let size = seek_to_start(f)?;
let bar = ProgressBar::new(size).with_style(file_progress_style());
let stdin = BufReader::new(io::stdin());
for (i, line) in stdin.lines().enumerate() {
let json_page = serde_json::from_str::<JsonPage>(&line?).unwrap();
let mut data = Data::new();
result.push_page(PageInfo {
id: json_page.id,
length: json_page.length,
redirect: json_page.redirect.is_some(),
title: json_page.title,
for (i, line) in bar.wrap_read(f).lines().enumerate() {
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
let normalized = normalizer.normalize(&page.title);
let (sift_i, _) = title_lookup[&normalized];
if i as u32 != sift_i {
// Articles may occur multiple times, and this is not the instance
// of the article we should keep.
bar.println(format!(
" Skipping {:?} ({i}) in favor of {sift_i}",
page.title
));
continue;
}
data.graph.add_node();
data.pages.push(Page {
id: page.id,
title: page.title,
length: page.length,
redirect: page.redirect.is_some(),
});
if let Some(to) = json_page.redirect {
let to = titles.insert(util::normalize_link(&to));
result.push_link(to, LinkInfo::default());
} else {
for (to, start, len, flags) in json_page.links {
let to = titles.insert(util::normalize_link(&to));
result.push_link(to, LinkInfo { start, len, flags });
}
let mut page_links = page.links;
if let Some(target) = page.redirect {
page_links.clear();
let len = target.len() as u32;
page_links.push((target, 0, len, 0));
}
if (i + 1) % 100_000 == 0 {
eprintln!("{} pages imported", i + 1)
}
}
eprintln!("Pages: {}", result.pages.len());
eprintln!("Links: {}", result.links.len());
eprintln!("Titles: {}", titles.titles.len());
eprintln!("Title map entries: {}", titles.map.len());
Ok((result, titles))
}
/// Create map from normalized title to index in pages.
fn initialize_pages_map(pages: &[Page<PageInfo>]) -> FxHashMap<String, u32> {
let mut result = FxHashMap::default();
for (i, p) in pages.iter().enumerate() {
match result.entry(util::normalize_link(&p.data.title)) {
Entry::Occupied(entry) => {
eprintln!(
"{:?} already exists at index {} as {:?}",
p.data.title,
entry.get(),
util::normalize_link(&p.data.title)
);
}
Entry::Vacant(entry) => {
entry.insert(i as u32);
for (target, start, len, flags) in page_links {
if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
data.graph.add_edge(NodeIdx(*brood_i));
data.links.push(Link { start, len, flags });
}
}
}
result
Ok(data)
}
fn second_stage(
first_stage: &AdjacencyList<PageInfo, LinkInfo>,
titles: &Titles,
) -> AdjacencyList<PageInfo, LinkInfo> {
let pages_map = initialize_pages_map(&first_stage.pages);
let mut result = AdjacencyList::default();
/// Convert sift data to brood data.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
/// The sift data file to ingest.
data: PathBuf,
}
for (page_idx, page) in first_stage.pages() {
result.push_page(page.data.clone());
impl Cmd {
pub fn run(&self, brood_data: &Path) -> io::Result<()> {
let normalizer = TitleNormalizer::new();
for (_, link) in first_stage.links(page_idx) {
let title = util::normalize_link(titles.get(link.to));
if let Some(to) = pages_map.get(&title) {
// The link points to an existing article, we should keep it
result.push_link(*to, link.data);
}
}
println!(">> First pass");
let mut sift_data = BufReader::new(File::open(&self.data)?);
if (page_idx + 1) % 100_000 == 0 {
eprintln!("{} pages imported", page_idx + 1)
}
println!("> Reading titles");
let titles = read_titles(&mut sift_data)?;
println!("> Computing title index lookup table");
let title_lookup = compute_title_lookup(&normalizer, &titles);
drop(titles); // Don't hoard memory
println!(">> Second pass");
println!("> Reading page data");
let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
assert_eq!(data.pages.len(), title_lookup.len());
drop(title_lookup); // Don't hoard memory
drop(sift_data); // No longer needed
println!("> Checking consistency");
data.check_consistency();
println!(">> Export");
println!(
"Pages: {:>13}",
data.pages.len().separate_with_underscores()
);
println!(
"Links: {:>13}",
data.links.len().separate_with_underscores()
);
data.write_to_file(brood_data)?;
Ok(())
}
eprintln!("Pages: {}", result.pages.len());
eprintln!("Links: {}", result.links.len());
eprintln!("Page map entries: {}", pages_map.len());
result
}
/// Run the two-stage ingest pipeline and write the resulting adjacency
/// list to `datafile`.
///
/// Stage one builds the raw page/link data plus the title table; stage
/// two resolves links against the page map. The result is consistency
/// checked before export.
pub fn ingest(datafile: &Path) -> io::Result<()> {
    eprintln!(">> First stage");
    let (first_stage, titles) = first_stage()?;
    eprintln!(">> Second stage");
    let data = second_stage(&first_stage, &titles);
    eprintln!(">> Consistency check");
    data.check_consistency();
    eprintln!(">> Export");
    // Shadow the path with a buffered writer for the export.
    let mut datafile = BufWriter::new(File::create(datafile)?);
    store::write_adjacency_list(&data, &mut datafile)?;
    Ok(())
}

View file

@ -1,23 +0,0 @@
use std::fs::File;
use std::io::{self, BufReader};
use std::path::Path;
use crate::data::store;
/// Print every page in the data file: redirects as `source -> target`
/// pairs (one line per outgoing link), regular pages as bare titles.
pub fn run(datafile: &Path) -> io::Result<()> {
    let mut reader = BufReader::new(File::open(datafile)?);
    let data = store::read_adjacency_list(&mut reader)?;
    for (idx, page) in data.pages() {
        if !page.data.redirect {
            println!("{:?}", page.data.title);
            continue;
        }
        for link_idx in data.link_range(idx) {
            let target = data.page(data.link(link_idx).to);
            println!("{:?} -> {:?}", page.data.title, target.data.title);
        }
    }
    Ok(())
}

View file

@ -0,0 +1,70 @@
use std::io;
use crate::{
algo::Dijkstra,
data::Data,
graph::NodeIdx,
util::{self, TitleNormalizer},
};
/// Find the article with the longest shortest path away from the starting
/// article.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
    // Title of the article to start the search from.
    start: String,
    // How many of the longest paths to print.
    #[arg(long, short, default_value_t = 1)]
    top: usize,
}
/// Print a search result between two articles: either the full path with
/// its cost, or a "no path" notice when `path` is `None`.
fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec<NodeIdx>)>) {
    // Shadow the node indices with their page titles for output.
    let start = &data.pages[start.usize()].title;
    let goal = &data.pages[goal.usize()].title;
    match path {
        None => println!("No path found from {start} to {goal}"),
        Some((cost, nodes)) => {
            println!("Path found (cost {cost}, length {}):", nodes.len());
            for node in nodes {
                println!("{}", util::fmt_page(&data.pages[node.usize()]));
            }
        }
    }
}
impl Cmd {
    /// Run Dijkstra from the start article over the whole graph, then
    /// print the `top` most expensive reachable targets and their paths.
    pub fn run(self, data: Data) -> io::Result<()> {
        let normalizer = TitleNormalizer::new();
        println!(">> Resolve article");
        let start = util::resolve_title(&normalizer, &data, &self.start);
        println!("Start: {}", data.pages[start.usize()].title);
        println!(">> Search paths");
        println!("> Preparing dijkstra");
        let mut dijkstra = Dijkstra::new(&data.graph);
        println!("> Running dijkstra");
        // Stop predicate is always false: we need distances to all nodes.
        // Leaving a redirect page costs 0, leaving any other page costs 1.
        dijkstra.run(
            start,
            |_| false,
            |source, _edge, _target| !data.pages[source.usize()].redirect as u32,
        );
        println!(">> Find longest paths");
        // u32::MAX is the "never reached" sentinel used by the search.
        let mut costs = data
            .graph
            .nodes()
            .map(|n| (dijkstra.cost(n), n))
            .filter(|(c, _)| *c < u32::MAX) // Only reachable nodes please
            .collect::<Vec<_>>();
        costs.sort_unstable();
        // Most expensive targets first.
        for (cost, goal) in costs.iter().rev().take(self.top) {
            let path = dijkstra.path(*goal);
            println!();
            print_path(&data, start, *goal, Some((*cost, path)));
        }
        Ok(())
    }
}

View file

@ -1,173 +0,0 @@
use std::collections::BinaryHeap;
use std::fs::File;
use std::io::{self, BufReader};
use std::path::Path;
use crate::data::adjacency_list::AdjacencyList;
use crate::data::info::{LinkInfo, PageInfo};
use crate::data::store;
use crate::util;
/// Per-page search state for the Dijkstra run.
struct DijkstraPageInfo {
    // Cheapest known cost from the start; u32::MAX = not reached yet.
    cost: u32,
    /// Index of the previous page.
    prev: u32,
    // Carried over from the original page info; redirects traverse free.
    redirect: bool,
}

impl DijkstraPageInfo {
    /// Initialize search state from a page, keeping only the redirect flag.
    fn from_page_info(info: PageInfo) -> Self {
        Self {
            cost: u32::MAX,
            prev: u32::MAX,
            redirect: info.redirect,
        }
    }
}
/// Per-link search state: the cost of traversing the link.
struct DijkstraLinkInfo {
    cost: u32,
}

impl DijkstraLinkInfo {
    /// Every link costs 1. The commented lines are alternative cost models
    /// based on the link's position that were experimented with.
    fn from_link_info(info: LinkInfo) -> Self {
        Self {
            cost: 1,
            // cost: 1000 + info.start,
            // cost: 10000 + info.start,
            // cost: 1000 + info.start / 10,
        }
    }
}
/// Priority-queue entry pairing a path cost with the page it reaches.
#[derive(Clone, Copy, PartialEq, Eq)]
struct Entry {
    cost: u32,
    page_idx: u32,
}

impl Entry {
    pub fn new(cost: u32, page_idx: u32) -> Self {
        Self { cost, page_idx }
    }
}

// Ordering is reversed on cost so that std's BinaryHeap (a max-heap)
// pops the cheapest entry first. Ties are broken by page index so the
// order is total and deterministic.
impl Ord for Entry {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let by_cost = other.cost.cmp(&self.cost);
        if by_cost == std::cmp::Ordering::Equal {
            self.page_idx.cmp(&other.page_idx)
        } else {
            by_cost
        }
    }
}

impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
/// Run Dijkstra from `from` over the entire graph, recording per-page
/// cost and predecessor index in the returned adjacency list.
///
/// Closely matches the dijkstra example in [std::collections::binary_heap].
fn full_dijkstra(
    data: AdjacencyList<PageInfo, LinkInfo>,
    from: u32,
) -> AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo> {
    println!("> Prepare state");
    // Re-type page/link payloads into mutable search state.
    let mut data = data
        .change_page_data(DijkstraPageInfo::from_page_info)
        .change_link_data(DijkstraLinkInfo::from_link_info);
    let mut queue = BinaryHeap::new();
    data.page_mut(from).data.cost = 0;
    queue.push(Entry::new(0, from));
    println!("> Run dijkstra");
    while let Some(Entry { cost, page_idx }) = queue.pop() {
        let page = data.page(page_idx);
        if cost > page.data.cost {
            // This queue entry is outdated
            continue;
        }
        let redirect = page.data.redirect;
        for link_idx in data.link_range(page_idx) {
            let link = data.link(link_idx);
            let next = Entry {
                // Leaving a redirect page is free; otherwise pay the
                // link's cost.
                cost: cost + if redirect { 0 } else { link.data.cost },
                page_idx: link.to,
            };
            let target_page = data.page_mut(link.to);
            if next.cost < target_page.data.cost {
                // Found a cheaper route: record it and (re-)queue the node.
                target_page.data.cost = next.cost;
                target_page.data.prev = page_idx;
                queue.push(next);
            }
        }
    }
    data
}
/// After a full Dijkstra run, pick the reachable page with the highest
/// cost and reconstruct the path to it by walking the `prev` chain.
///
/// Returns `None` if no page was reached at all, or if the reconstructed
/// chain does not actually begin at `from`.
fn find_longest_shortest_path(
    data: AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo>,
    from: u32,
) -> Option<Vec<u32>> {
    // Most expensive page that was actually reached (cost != u32::MAX).
    let to = data
        .pages
        .iter()
        .enumerate()
        .filter(|(_, p)| p.data.cost != u32::MAX)
        .max_by_key(|(_, p)| p.data.cost)?
        .0 as u32;
    // Walk predecessors until the u32::MAX sentinel (set only on the
    // start page, which never gets a predecessor).
    let mut steps = vec![];
    let mut at = to;
    loop {
        steps.push(at);
        at = data.page(at).data.prev;
        if at == u32::MAX {
            break;
        };
    }
    steps.reverse();
    if steps.first() == Some(&from) {
        Some(steps)
    } else {
        None
    }
}
/// CLI entry point: load the data file, run a full Dijkstra from `from`,
/// and print the longest shortest path found.
pub fn run(datafile: &Path, from: &str) -> io::Result<()> {
    println!(">> Import");
    let mut databuf = BufReader::new(File::open(datafile)?);
    let data = store::read_adjacency_list(&mut databuf)?;
    // Keep a copy of the page infos; `data` is consumed by the search.
    let pages = data.pages.clone();
    println!(">> Locate from and to");
    let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from));
    println!("From: {:?}", data.page(from_idx).data.title);
    println!(">> Find all shortest paths");
    let data = full_dijkstra(data, from_idx);
    println!(">> Find longest shortest path");
    let path = find_longest_shortest_path(data, from_idx);
    if let Some(path) = path {
        println!("Path found:");
        for page_idx in path {
            let page = &pages[page_idx as usize];
            // 'v' marks redirect pages, '-' regular articles.
            if page.data.redirect {
                println!(" v {:?}", page.data.title);
            } else {
                println!(" - {:?}", page.data.title);
            }
        }
    } else {
        println!("No path found");
    }
    Ok(())
}

View file

@ -1,159 +1,87 @@
use std::collections::BinaryHeap;
use std::fs::File;
use std::io::{self, BufReader};
use std::path::Path;
use std::io;
use crate::data::adjacency_list::AdjacencyList;
use crate::data::info::{LinkInfo, PageInfo};
use crate::data::store;
use crate::util;
use crate::{
algo::Dijkstra,
data::Data,
graph::NodeIdx,
util::{self, TitleNormalizer},
};
struct DijkstraPageInfo {
cost: u32,
prev: u32,
redirect: bool,
/// Find the shortest path between two articles.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
start: String,
goal: String,
// Search for a path in both directions.
#[arg(long, short)]
bidi: bool,
}
impl DijkstraPageInfo {
fn from_page_info(info: PageInfo) -> Self {
Self {
cost: u32::MAX,
prev: u32::MAX,
redirect: info.redirect,
}
/// Find the shortest path from `start` to `goal` with Dijkstra.
///
/// Leaving a redirect page costs 0, leaving any other page costs 1 (see
/// the edge-cost closure). Returns the total cost together with the node
/// sequence, or `None` when `goal` is unreachable.
fn search_path(data: &Data, start: NodeIdx, goal: NodeIdx) -> Option<(u32, Vec<NodeIdx>)> {
    println!("> Preparing dijkstra");
    let mut dijkstra = Dijkstra::new(&data.graph);
    println!("> Running dijkstra");
    dijkstra.run(
        start,
        |node| node == goal,
        |source, _edge, _target| !data.pages[source.usize()].redirect as u32,
    );
    // u32::MAX is the "never reached" sentinel. Look the cost up once
    // instead of querying the search state twice.
    let cost = dijkstra.cost(goal);
    if cost == u32::MAX {
        return None;
    }
    println!("> Collecting path");
    let path = dijkstra.path(goal);
    Some((cost, path))
}
/// Print a search result between two articles: the full path with its
/// cost, or a "no path" notice when `path` is `None`.
fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec<NodeIdx>)>) {
    // Shadow the node indices with their page titles for output.
    let start = &data.pages[start.usize()].title;
    let goal = &data.pages[goal.usize()].title;
    let Some((cost, path)) = path else {
        println!("No path found from {start} to {goal}");
        return;
    };
    println!("Path found (cost {cost}, length {}):", path.len());
    for page in path {
        println!("{}", util::fmt_page(&data.pages[page.usize()]));
    }
}
struct DijkstraLinkInfo {
cost: u32,
}
impl Cmd {
pub fn run(self, data: Data) -> io::Result<()> {
let normalizer = TitleNormalizer::new();
impl DijkstraLinkInfo {
fn from_link_info(info: LinkInfo) -> Self {
Self {
cost: 1,
// cost: 1000 + info.start,
// cost: 10000 + info.start,
// cost: 1000 + info.start / 10,
}
}
}
println!(">> Resolve articles");
let start = util::resolve_title(&normalizer, &data, &self.start);
let goal = util::resolve_title(&normalizer, &data, &self.goal);
println!("Start: {}", data.pages[start.usize()].title);
println!("Goal: {}", data.pages[goal.usize()].title);
#[derive(Clone, Copy, PartialEq, Eq)]
struct Entry {
cost: u32,
page_idx: u32,
}
if self.bidi {
println!(">> Find path forward");
let forward = search_path(&data, start, goal);
println!(">> Find path backward");
let backward = search_path(&data, goal, start);
impl Entry {
pub fn new(cost: u32, page_idx: u32) -> Self {
Self { cost, page_idx }
}
}
println!();
print_path(&data, start, goal, forward);
println!();
print_path(&data, goal, start, backward);
} else {
println!(">> Find path");
let path = search_path(&data, start, goal);
// Manual implementation so the queue is a min-heap instead of a max-heap.
impl Ord for Entry {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
other
.cost
.cmp(&self.cost)
.then_with(|| self.page_idx.cmp(&other.page_idx))
}
}
impl PartialOrd for Entry {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
/// Closely matches the dijkstra example in [std::collections::binary_heap].
fn dijkstra(data: AdjacencyList<PageInfo, LinkInfo>, from: u32, to: u32) -> Option<Vec<u32>> {
println!("> Prepare state");
let mut data = data
.change_page_data(DijkstraPageInfo::from_page_info)
.change_link_data(DijkstraLinkInfo::from_link_info);
let mut queue = BinaryHeap::new();
data.page_mut(from).data.cost = 0;
queue.push(Entry::new(0, from));
println!("> Run dijkstra");
while let Some(Entry { cost, page_idx }) = queue.pop() {
if page_idx == to {
// We've found the shortest path to our target
break;
println!();
print_path(&data, start, goal, path);
}
let page = data.page(page_idx);
if cost > page.data.cost {
// This queue entry is outdated
continue;
}
let redirect = page.data.redirect;
for link_idx in data.link_range(page_idx) {
let link = data.link(link_idx);
let next = Entry {
cost: cost + if redirect { 0 } else { link.data.cost },
page_idx: link.to,
};
let target_page = data.page_mut(link.to);
if next.cost < target_page.data.cost {
target_page.data.cost = next.cost;
target_page.data.prev = page_idx;
queue.push(next);
}
}
}
println!("> Collect results");
let mut steps = vec![];
let mut at = to;
loop {
steps.push(at);
at = data.page(at).data.prev;
if at == u32::MAX {
break;
};
}
steps.reverse();
if steps.first() == Some(&from) {
Some(steps)
} else {
None
Ok(())
}
}
pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
println!(">> Import");
let mut databuf = BufReader::new(File::open(datafile)?);
let data = store::read_adjacency_list(&mut databuf)?;
let pages = data.pages.clone();
println!(">> Locate from and to");
let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from));
let to_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, to));
println!("From: {:?}", data.page(from_idx).data.title);
println!("To: {:?}", data.page(to_idx).data.title);
println!(">> Find path");
let path = dijkstra(data, from_idx, to_idx);
if let Some(path) = path {
println!("Path found:");
for page_idx in path {
let page = &pages[page_idx as usize];
if page.data.redirect {
println!(" v {:?}", page.data.title);
} else {
println!(" - {:?}", page.data.title);
}
}
} else {
println!("No path found");
}
Ok(())
}

273
brood/src/commands/pg.rs Normal file
View file

@ -0,0 +1,273 @@
use std::{
collections::{BTreeSet, HashMap, HashSet},
io::{self, BufWriter},
};
use crate::{
data::Data,
graph::NodeIdx,
util::{self, TitleNormalizer},
};
/// Maps every node to at most one other node; `NodeIdx::NONE` marks the
/// absence of a mapping.
struct PageMap(Vec<NodeIdx>);

impl PageMap {
    /// Create a map of `len` entries, all initially unmapped.
    fn new(len: usize) -> Self {
        Self(vec![NodeIdx::NONE; len])
    }

    fn get(&self, node: NodeIdx) -> NodeIdx {
        self.0[node.usize()]
    }

    fn set(&mut self, node: NodeIdx, to: NodeIdx) {
        self.0[node.usize()] = to;
    }
}
/// Return the first outgoing link of `node` that is neither inside
/// parentheses nor inside a structure (infobox/table), or `None` if no
/// such link exists.
fn first_viable_link(data: &Data, node: NodeIdx) -> Option<NodeIdx> {
    data.graph
        .edge_slice(node)
        .iter()
        .find(|edge| {
            let link = &data.links[edge.usize()];
            !link.in_parens() && !link.in_structure()
        })
        .copied()
}
/// For every node, record its first viable link (if any) in a `PageMap`.
/// Nodes without a viable link keep the `NodeIdx::NONE` default.
fn find_forward_edges(data: &Data) -> PageMap {
    let mut forward = PageMap::new(data.pages.len());
    for node in data.graph.nodes() {
        match first_viable_link(data, node) {
            Some(target) => forward.set(node, target),
            None => {}
        }
    }
    forward
}
/// Assign every node to a cluster, identified by a canonical node.
///
/// Following first-link edges from any page either dead-ends or enters a
/// loop; all pages funneling into the same dead-end/loop form a cluster.
/// The canonical element is the dead-end, or the loop member with the
/// smallest index.
fn find_clusters(data: &Data, forward: &PageMap) -> PageMap {
    let mut cluster = PageMap::new(data.pages.len());
    for node in data.graph.nodes() {
        let mut current = node;
        let mut visited = HashSet::new();
        let canonical = loop {
            // We've already determined the canonical element for this page.
            if cluster.get(current) != NodeIdx::NONE {
                break cluster.get(current);
            }
            // We've hit a loop
            if visited.contains(&current) {
                // Walk the loop once to collect its members; BTreeSet
                // keeps them ordered, so pop_first yields the smallest
                // index as canonical element.
                let mut loop_members = BTreeSet::new();
                while !loop_members.contains(&current) {
                    loop_members.insert(current);
                    current = forward.get(current);
                }
                break loop_members.pop_first().unwrap();
            }
            visited.insert(current);
            let next = forward.get(current);
            if next == NodeIdx::NONE {
                // We've hit a dead-end
                break current;
            }
            current = next;
        };
        // All nodes on the walk belong to the same cluster.
        for i in visited {
            cluster.set(i, canonical);
        }
    }
    cluster
}
/// A cluster's terminal structure: either a single dead-end page, or the
/// members of a first-link loop.
enum Cluster {
    DeadEnd(NodeIdx),
    Loop(Vec<NodeIdx>),
}
/// For every distinct canonical node, materialize its cluster: a
/// `DeadEnd` when the canonical node has no forward edge, otherwise the
/// full `Loop` membership collected by following forward edges until the
/// walk returns to the canonical node.
fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap<NodeIdx, Cluster> {
    let mut result = HashMap::new();
    // Deduplicate canonical nodes via a HashSet.
    for canonical in cluster.0.iter().copied().collect::<HashSet<_>>() {
        if forward.get(canonical) == NodeIdx::NONE {
            result.insert(canonical, Cluster::DeadEnd(canonical));
            continue;
        }
        let mut members = vec![];
        let mut current = canonical;
        loop {
            members.push(current);
            current = forward.get(current);
            if current == canonical {
                break;
            }
        }
        result.insert(canonical, Cluster::Loop(members));
    }
    result
}
/// Dump a JSON object mapping every page title to the title of its first
/// viable link (or `null` when the page has none) to stdout.
fn print_forward_edges_as_json(data: &Data, forward: &PageMap) -> io::Result<()> {
    let map = forward
        .0
        .iter()
        .enumerate()
        .map(|(node, first_link)| {
            let page_title = &data.pages[node].title;
            // NONE means the page has no viable first link.
            let first_link_title = if *first_link == NodeIdx::NONE {
                None
            } else {
                Some(&data.pages[first_link.usize()].title)
            };
            (page_title, first_link_title)
        })
        .collect::<HashMap<_, _>>();
    let writer = BufWriter::new(io::stdout());
    serde_json::to_writer_pretty(writer, &map)?;
    Ok(())
}
/// Follow first-link edges from `start` and print each visited page until
/// a dead-end or a loop is reached. 'v' marks redirects, '-' articles.
fn print_trace(normalizer: &TitleNormalizer, data: &Data, forward: &PageMap, start: &str) {
    let start_idx = util::resolve_title(normalizer, data, start);
    let mut current = start_idx;
    let mut visited = HashSet::new();
    loop {
        let page = &data.pages[current.usize()];
        let title = &page.title;
        if page.redirect {
            println!(" v {title}");
        } else {
            println!(" - {title}");
        }
        visited.insert(current);
        let next = forward.get(current);
        if next == NodeIdx::NONE {
            println!("> dead-end reached");
            return;
        }
        // Seeing a page twice means the trace has entered a loop.
        if visited.contains(&next) {
            let page = &data.pages[next.usize()];
            let title = &page.title;
            println!("> loop detected ({title})");
            return;
        }
        current = next;
    }
}
/// Dump a JSON object mapping every page title to the title of its
/// cluster's canonical page to stdout.
fn print_canonical_pages_as_json(data: &Data, cluster: &PageMap) -> io::Result<()> {
    let map = cluster
        .0
        .iter()
        .enumerate()
        .map(|(page, canonical)| {
            (
                &data.pages[page].title,
                &data.pages[canonical.usize()].title,
            )
        })
        .collect::<HashMap<_, _>>();
    let writer = BufWriter::new(io::stdout());
    serde_json::to_writer_pretty(writer, &map)?;
    Ok(())
}
// Subcommands of the philosophy-game analysis.
#[derive(Debug, PartialEq, Eq, clap::Parser)]
enum Command {
    // Dump each page's first viable link as JSON.
    First,
    // Follow first links from one page until a dead-end or loop.
    Trace { start: String },
    // Dump each page's canonical cluster element as JSON.
    Canonical,
    // Print all clusters ordered by size.
    Cluster,
}
/// Analyze the philosophy game: follow each article's first viable link
/// and see where the chains end up.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
    #[command(subcommand)]
    command: Command,
}
impl Cmd {
    /// Dispatch the selected philosophy-game subcommand. All subcommands
    /// share the forward-edge map; the cluster-based ones additionally
    /// compute and measure clusters.
    pub fn run(self, data: Data) -> io::Result<()> {
        let normalizer = TitleNormalizer::new();
        eprintln!(">> Forward");
        let forward = find_forward_edges(&data);
        // The first two subcommands only need the forward edges.
        match self.command {
            Command::First => {
                eprintln!(">> First links");
                print_forward_edges_as_json(&data, &forward)?;
                return Ok(());
            }
            Command::Trace { start } => {
                eprintln!(">> Tracing");
                print_trace(&normalizer, &data, &forward, &start);
                return Ok(());
            }
            _ => {}
        }
        // Determine cluster for each page, represented via canonical page. The
        // canonical page of a cluster is either a dead-end or the loop member with
        // the smallest index.
        eprintln!(">> Find clusters");
        let cluster = find_clusters(&data, &forward);
        if self.command == Command::Canonical {
            print_canonical_pages_as_json(&data, &cluster)?;
            return Ok(());
        }
        // Measure cluster size
        eprintln!(">> Measure clusters");
        let mut cluster_size = HashMap::<NodeIdx, u32>::new();
        for (i, canonical) in cluster.0.iter().enumerate() {
            // Every page must have been assigned to a cluster by now.
            assert!(*canonical != NodeIdx::NONE, "{}", data.pages[i].title);
            *cluster_size.entry(*canonical).or_default() += 1;
        }
        let mut cluster_by_size = cluster_size.into_iter().collect::<Vec<_>>();
        cluster_by_size.sort_by_key(|(c, s)| (*s, *c));
        cluster_by_size.reverse();
        // Print clusters
        assert!(self.command == Command::Cluster);
        let resolved = resolve_clusters(&forward, &cluster);
        for (canonical, size) in cluster_by_size {
            match resolved.get(&canonical).unwrap() {
                Cluster::DeadEnd(page) => {
                    let title = &data.pages[page.usize()].title;
                    println!("Cluster (dead-end, {size}): {title}");
                }
                Cluster::Loop(pages) => {
                    println!("Cluster ({}-loop, {size}):", pages.len());
                    // 'v' marks redirect pages, '-' regular articles.
                    for page in pages {
                        let page = &data.pages[page.usize()];
                        let title = &page.title;
                        if page.redirect {
                            println!(" v {title}");
                        } else {
                            println!(" - {title}");
                        }
                    }
                }
            }
        }
        Ok(())
    }
}

View file

@ -1,267 +0,0 @@
use std::{
collections::{BTreeSet, HashMap, HashSet},
fs::File,
io::{self, BufReader, BufWriter},
path::Path,
};
use crate::{
data::{
adjacency_list::AdjacencyList,
info::{LinkInfo, PageInfo},
store,
},
util, PhilosophyGameCmd,
};
/// Maps every page index to at most one other page; `u32::MAX` marks the
/// absence of a mapping.
struct PageMap(Vec<u32>);

impl PageMap {
    /// Create a map of `len` entries, all initially unmapped.
    fn new(len: usize) -> Self {
        Self(vec![u32::MAX; len])
    }

    fn get(&self, page_idx: u32) -> u32 {
        self.0[page_idx as usize]
    }

    fn set(&mut self, page_idx: u32, to: u32) {
        self.0[page_idx as usize] = to;
    }
}
/// Return the target of the first link of `page_idx` that is neither in
/// parentheses nor in a structure, or `None` if no such link exists.
fn first_viable_link(data: &AdjacencyList<PageInfo, LinkInfo>, page_idx: u32) -> Option<u32> {
    for link_idx in data.link_range(page_idx) {
        let link = data.link(link_idx);
        if !link.data.in_parens() && !link.data.in_structure() {
            return Some(link.to);
        }
    }
    None
}
/// For every page, record its first viable link (if any) in a `PageMap`.
/// Pages without one keep the `u32::MAX` default.
fn find_forward_edges(data: &AdjacencyList<PageInfo, LinkInfo>) -> PageMap {
    let mut result = PageMap::new(data.pages.len());
    for (page_idx, _) in data.pages() {
        if let Some(first_link) = first_viable_link(data, page_idx) {
            result.set(page_idx, first_link);
        }
    }
    result
}
/// Assign every page to a cluster, identified by a canonical page: the
/// dead-end the first-link chain ends in, or the smallest-index member of
/// the loop it enters.
fn find_clusters(data: &AdjacencyList<PageInfo, LinkInfo>, forward: &PageMap) -> PageMap {
    let mut cluster = PageMap::new(data.pages.len());
    for (page_idx, _) in data.pages() {
        let mut current = page_idx;
        let mut visited = HashSet::new();
        let canonical = loop {
            // We've already determined the canonical element for this page.
            if cluster.get(current) != u32::MAX {
                break cluster.get(current);
            }
            // We've hit a loop
            if visited.contains(&current) {
                // Collect the loop members; the ordered BTreeSet yields
                // the smallest index as canonical element.
                let mut loop_members = BTreeSet::new();
                while !loop_members.contains(&current) {
                    loop_members.insert(current);
                    current = forward.get(current);
                }
                break loop_members.pop_first().unwrap();
            }
            visited.insert(current);
            let next = forward.get(current);
            if next == u32::MAX {
                // We've hit a dead-end
                break current;
            }
            current = next;
        };
        // All pages on the walk belong to the same cluster.
        for i in visited {
            cluster.set(i, canonical);
        }
    }
    cluster
}
/// A cluster's terminal structure: a single dead-end page or the members
/// of a first-link loop.
enum Cluster {
    DeadEnd(u32),
    Loop(Vec<u32>),
}
/// For every distinct canonical page, materialize its cluster: `DeadEnd`
/// when it has no forward edge, otherwise the `Loop` membership collected
/// by following forward edges back around to the canonical page.
fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap<u32, Cluster> {
    let mut result = HashMap::new();
    // Deduplicate canonical pages via a HashSet.
    for canonical in cluster.0.iter().copied().collect::<HashSet<_>>() {
        if forward.get(canonical) == u32::MAX {
            result.insert(canonical, Cluster::DeadEnd(canonical));
            continue;
        }
        let mut members = vec![];
        let mut current = canonical;
        loop {
            members.push(current);
            current = forward.get(current);
            if current == canonical {
                break;
            }
        }
        result.insert(canonical, Cluster::Loop(members));
    }
    result
}
/// Dump a JSON object mapping every page title to the title of its first
/// viable link (or `null` when there is none) to stdout.
fn print_forward_edges_as_json(
    data: &AdjacencyList<PageInfo, LinkInfo>,
    forward: &PageMap,
) -> io::Result<()> {
    let map = forward
        .0
        .iter()
        .enumerate()
        .map(|(page, first_link)| {
            let page_title = &data.page(page as u32).data.title;
            // u32::MAX means the page has no viable first link.
            let first_link_title = if *first_link == u32::MAX {
                None
            } else {
                Some(&data.page(*first_link).data.title)
            };
            (page_title, first_link_title)
        })
        .collect::<HashMap<_, _>>();
    let writer = BufWriter::new(io::stdout());
    serde_json::to_writer_pretty(writer, &map)?;
    Ok(())
}
/// Follow first-link edges from `start` and print every visited page
/// until a dead-end or loop is reached. 'v' marks redirects, '-' articles.
fn print_trace(data: &AdjacencyList<PageInfo, LinkInfo>, forward: &PageMap, start: &str) {
    let start_idx = util::resolve_redirects(data, util::find_index_of_title(&data.pages, start));
    let mut current = start_idx;
    let mut visited = HashSet::new();
    loop {
        let page = data.page(current);
        let title = &page.data.title;
        if page.data.redirect {
            println!(" v {title}");
        } else {
            println!(" - {title}");
        }
        visited.insert(current);
        let next = forward.get(current);
        if next == u32::MAX {
            println!("dead-end reached");
            return;
        }
        // Seeing a page twice means the trace has entered a loop.
        if visited.contains(&next) {
            println!("loop detected");
            return;
        }
        current = next;
    }
}
/// Dump a JSON object mapping every page title to the title of its
/// cluster's canonical page to stdout.
fn print_canonical_pages_as_json(
    data: &AdjacencyList<PageInfo, LinkInfo>,
    cluster: &PageMap,
) -> io::Result<()> {
    let map = cluster
        .0
        .iter()
        .enumerate()
        .map(|(page, canonical)| {
            (
                &data.page(page as u32).data.title,
                &data.page(*canonical).data.title,
            )
        })
        .collect::<HashMap<_, _>>();
    let writer = BufWriter::new(io::stdout());
    serde_json::to_writer_pretty(writer, &map)?;
    Ok(())
}
/// CLI entry point for the philosophy-game analysis: load the data file,
/// compute forward edges, and dispatch on the subcommand.
pub fn run(datafile: &Path, subcmd: PhilosophyGameCmd) -> io::Result<()> {
    eprintln!(">> Import");
    let mut databuf = BufReader::new(File::open(datafile)?);
    let data = store::read_adjacency_list(&mut databuf)?;
    eprintln!(">> Forward");
    let forward = find_forward_edges(&data);
    // The first two subcommands only need the forward edges.
    match subcmd {
        PhilosophyGameCmd::First => {
            eprintln!(">> First links");
            print_forward_edges_as_json(&data, &forward)?;
            return Ok(());
        }
        PhilosophyGameCmd::Trace { start } => {
            eprintln!(">> Tracing");
            print_trace(&data, &forward, &start);
            return Ok(());
        }
        _ => {}
    }
    // Determine cluster for each page, represented via canonical page. The
    // canonical page of a cluster is either a dead-end or the loop member with
    // the smallest index.
    eprintln!(">> Find clusters");
    let cluster = find_clusters(&data, &forward);
    if subcmd == PhilosophyGameCmd::Canonical {
        print_canonical_pages_as_json(&data, &cluster)?;
        return Ok(());
    }
    // Measure cluster size
    eprintln!(">> Measure clusters");
    let mut cluster_size = HashMap::<u32, u32>::new();
    for (i, canonical) in cluster.0.iter().enumerate() {
        // Every page must have been assigned to a cluster by now.
        assert!(*canonical != u32::MAX, "{}", data.page(i as u32).data.title);
        *cluster_size.entry(*canonical).or_default() += 1;
    }
    let mut cluster_by_size = cluster_size.into_iter().collect::<Vec<_>>();
    cluster_by_size.sort_by_key(|(c, s)| (*s, *c));
    cluster_by_size.reverse();
    // Print clusters
    assert!(subcmd == PhilosophyGameCmd::Cluster);
    let resolved = resolve_clusters(&forward, &cluster);
    for (canonical, size) in cluster_by_size {
        match resolved.get(&canonical).unwrap() {
            Cluster::DeadEnd(page) => {
                let title = &data.page(*page).data.title;
                println!("Cluster (dead-end, {size}): {title}");
            }
            Cluster::Loop(pages) => {
                println!("Cluster ({}-loop, {size}):", pages.len());
                // 'v' marks redirect pages, '-' regular articles.
                for page in pages {
                    let page = data.page(*page);
                    let title = &page.data.title;
                    if page.data.redirect {
                        println!(" v {title}");
                    } else {
                        println!(" - {title}");
                    }
                }
            }
        }
    }
    Ok(())
}

View file

@ -1,48 +0,0 @@
use std::fs::File;
use std::io::{self, BufReader, BufWriter};
use std::path::Path;
use crate::data::adjacency_list::AdjacencyList;
use crate::data::store;
/// Re-export a data file, optionally filtering links by their
/// in-parentheses / in-structure flags.
///
/// A `Some(v)` filter keeps only links whose flag equals `v`; `None`
/// leaves that dimension unfiltered. With both filters `None` the data is
/// copied through unchanged.
pub fn reexport(
    from: &Path,
    to: &Path,
    in_parens: Option<bool>,
    in_structure: Option<bool>,
) -> io::Result<()> {
    eprintln!(">> Import");
    let mut from = BufReader::new(File::open(from)?);
    let mut data = store::read_adjacency_list(&mut from)?;
    eprintln!(">> Consistency check");
    data.check_consistency();
    if in_parens.is_some() || in_structure.is_some() {
        eprintln!(">> Filtering");
        // Rebuild the adjacency list, copying pages and keeping only
        // links that pass both filters.
        let mut data2 = AdjacencyList::default();
        for (page_idx, page) in data.pages() {
            data2.push_page(page.data.clone());
            for (_, link) in data.links(page_idx) {
                if in_parens.is_some_and(|v| v != link.data.in_parens()) {
                    continue;
                }
                if in_structure.is_some_and(|v| v != link.data.in_structure()) {
                    continue;
                }
                data2.push_link(link.to, link.data);
            }
        }
        data = data2;
    }
    eprintln!(">> Export");
    let mut to = BufWriter::new(File::create(to)?);
    store::write_adjacency_list(&data, &mut to)?;
    Ok(())
}

151
brood/src/commands/show.rs Normal file
View file

@ -0,0 +1,151 @@
use std::{collections::HashSet, io};
use thousands::Separable;
use crate::{
data::Data,
util::{self, TitleNormalizer},
};
/// Show info about a specific article.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
    // Title of the article to inspect.
    title: String,
    /// Print links in more detail.
    #[arg(long, short)]
    links: bool,
}
impl Cmd {
    /// Print detailed info for the located article and — following the
    /// redirect chain — for every article it redirects to.
    pub fn run(self, data: Data) -> io::Result<()> {
        let normalizer = TitleNormalizer::new();
        println!(">> Locate article");
        let mut node = util::locate_title(&normalizer, &data, &self.title);
        loop {
            let page = &data.pages[node.usize()];
            // Column widths for the label/number table below.
            const W_LABEL: usize = 12;
            const W_NUM: usize = 11;
            println!();
            println!("{:>W_LABEL$}: {}", "Title", page.title);
            println!(
                "{:>W_LABEL$}: {}",
                "Title (norm)",
                normalizer.normalize(&page.title)
            );
            println!("{:>W_LABEL$}: {}", "Redirect", page.redirect);
            println!("{:>W_LABEL$}: {:>W_NUM$}", "ID", page.id);
            println!(
                "{:>W_LABEL$}: {:>W_NUM$}",
                "Length",
                page.length.separate_with_underscores()
            );
            // Outgoing links come straight off the edge slice; incoming
            // links require a scan over all edges in the graph.
            let outlinks = data.graph.edge_slice(node).to_vec();
            let inlinks = data
                .graph
                .edges()
                .filter(|(_, target)| *target == node)
                .map(|(source, _)| source)
                .collect::<Vec<_>>();
            // "Twins" are pages that both link here and are linked from here.
            let outlinks_set = outlinks.iter().copied().collect::<HashSet<_>>();
            let inlinks_set = inlinks.iter().copied().collect::<HashSet<_>>();
            let twins_set = outlinks_set
                .intersection(&inlinks_set)
                .copied()
                .collect::<HashSet<_>>();
            println!(
                "{:>W_LABEL$}: {:>W_NUM$}",
                "Links (out)",
                outlinks.len().separate_with_underscores()
            );
            println!(
                "{:>W_LABEL$}: {:>W_NUM$}",
                "unique",
                outlinks_set.len().separate_with_underscores()
            );
            println!(
                "{:>W_LABEL$}: {:>W_NUM$}",
                "Links (in)",
                inlinks.len().separate_with_underscores()
            );
            println!(
                "{:>W_LABEL$}: {:>W_NUM$}",
                "unique",
                inlinks_set.len().separate_with_underscores()
            );
            println!(
                "{:>W_LABEL$}: {:>W_NUM$}",
                "Twins",
                twins_set.len().separate_with_underscores()
            );
            if self.links {
                // Detailed listing: twins, out-only, and in-only pages,
                // each sorted by title.
                let mut twin_pages = twins_set
                    .iter()
                    .map(|n| &data.pages[n.usize()])
                    .collect::<Vec<_>>();
                let mut outlink_only_pages = outlinks_set
                    .difference(&twins_set)
                    .map(|n| &data.pages[n.usize()])
                    .collect::<Vec<_>>();
                let mut inlink_only_pages = inlinks_set
                    .difference(&twins_set)
                    .map(|n| &data.pages[n.usize()])
                    .collect::<Vec<_>>();
                twin_pages.sort_by_key(|p| &p.title);
                outlink_only_pages.sort_by_key(|p| &p.title);
                inlink_only_pages.sort_by_key(|p| &p.title);
                println!();
                println!("Twins ({}):", twin_pages.len().separate_with_underscores());
                for page in twin_pages {
                    println!("{}", util::fmt_page(page));
                }
                println!();
                println!(
                    "Only outlinks ({}):",
                    outlink_only_pages.len().separate_with_underscores()
                );
                for page in outlink_only_pages {
                    println!("{}", util::fmt_page(page));
                }
                println!();
                println!(
                    "Only inlinks ({}):",
                    inlink_only_pages.len().separate_with_underscores()
                );
                for page in inlink_only_pages {
                    println!("{}", util::fmt_page(page));
                }
            }
            // Continue with the redirect target, if any.
            node = match data.redirect_target(node) {
                Some(target) => target,
                None => break,
            };
        }
        Ok(())
    }
}

View file

@ -0,0 +1,98 @@
mod degrees;
mod redirects;
use std::io;
use thousands::Separable;
use crate::data::Data;
// Subcommands for specialized stats; without one, general stats print.
#[derive(Debug, clap::Parser)]
enum Command {
    Degrees(degrees::Cmd),
    Redirects(redirects::Cmd),
}
/// Show interesting stats.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
    // When absent, the general page/link overview is printed.
    #[command(subcommand)]
    command: Option<Command>,
}
impl Cmd {
    /// Dispatch to a stats subcommand, or print the general overview of
    /// page and link counts.
    pub fn run(self, data: Data) -> io::Result<()> {
        if let Some(cmd) = self.command {
            return match cmd {
                Command::Degrees(cmd) => cmd.run(data),
                Command::Redirects(cmd) => cmd.run(data),
            };
        }
        println!();
        // Column widths for the label/number table below.
        const W_LABEL: usize = 14;
        const W_NUM: usize = 11;
        // Pages split into proper articles and redirects.
        let n_pages = data.pages.len();
        let n_redirects = data.pages.iter().filter(|p| p.redirect).count();
        let n_articles = n_pages - n_redirects;
        println!(
            "{:>W_LABEL$}: {:>W_NUM$}",
            "Pages",
            n_pages.separate_with_underscores()
        );
        println!(
            "{:>W_LABEL$}: {:>W_NUM$}",
            "Articles",
            n_articles.separate_with_underscores()
        );
        println!(
            "{:>W_LABEL$}: {:>W_NUM$}",
            "Redirects",
            n_redirects.separate_with_underscores()
        );
        println!();
        println!(
            "{:>W_LABEL$}: {:>W_NUM$}",
            "Links",
            data.links.len().separate_with_underscores()
        );
        println!(
            "{:>W_LABEL$}: {:>W_NUM$}",
            "in parens",
            data.links
                .iter()
                .filter(|l| l.in_parens())
                .count()
                .separate_with_underscores()
        );
        println!(
            "{:>W_LABEL$}: {:>W_NUM$}",
            "in structures",
            data.links
                .iter()
                .filter(|l| l.in_structure())
                .count()
                .separate_with_underscores()
        );
        // Links eligible for the philosophy game: neither in parentheses
        // nor inside a structure.
        println!(
            "{:>W_LABEL$}: {:>W_NUM$}",
            "pg eligible",
            data.links
                .iter()
                .filter(|l| !l.in_parens() && !l.in_structure())
                .count()
                .separate_with_underscores()
        );
        Ok(())
    }
}

View file

@ -0,0 +1,92 @@
use std::{cmp::Reverse, io};
use thousands::Separable;
use crate::{
algo,
data::{Data, Page},
util,
};
/// Show stats on article in- and out-degrees.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
    // How many entries to print per ranking.
    #[arg(long, short, default_value_t = 5)]
    top: usize,
}
impl Cmd {
    /// Compute in- and out-degrees for every page and print several
    /// top-N rankings: raw out-/in-degrees and out/in degree ratios.
    pub fn run(self, mut data: Data) -> io::Result<()> {
        println!(">> Outdegree");
        println!("> Counting links");
        // usize::MAX is a "not counted" placeholder; presumably
        // data.graph.nodes() covers every page, so every slot is
        // overwritten — TODO confirm.
        let mut outdegree = vec![usize::MAX; data.pages.len()];
        for node in data.graph.nodes() {
            outdegree[node.usize()] = data.graph.edge_range(node).len();
        }
        println!(">> Indegree");
        println!("> Inverting edges");
        // Inverting the graph turns in-edges into out-edges, so the same
        // counting loop now yields indegrees.
        algo::invert(&mut data);
        let mut indegree = vec![usize::MAX; data.pages.len()];
        println!("> Counting links");
        for node in data.graph.nodes() {
            indegree[node.usize()] = data.graph.edge_range(node).len();
        }
        // (page, outdegree, indegree) triples, re-sorted per ranking.
        let mut by_degrees = data
            .pages
            .iter()
            .zip(outdegree)
            .zip(indegree)
            .map(|((p, od), id)| (p, od, id))
            .collect::<Vec<_>>();
        println!();
        println!("Most outlinks");
        println!("¯¯¯¯¯¯¯¯¯¯¯¯¯");
        by_degrees.sort_by_key(|(_, od, _)| Reverse(*od));
        self.print_links(&by_degrees);
        println!();
        println!("Most inlinks");
        println!("¯¯¯¯¯¯¯¯¯¯¯¯");
        by_degrees.sort_by_key(|(_, _, id)| Reverse(*id));
        self.print_links(&by_degrees);
        // Zero degrees would make the ratios below infinite/NaN, so drop
        // those pages before ranking by ratio.
        by_degrees.retain(|(_, od, id)| *od > 0 && *id > 0);
        println!();
        println!("Most outlinks per non-zero inlink");
        println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯");
        by_degrees.sort_by(|(_, od1, id1), (_, od2, id2)| {
            let r1 = *od1 as f32 / *id1 as f32;
            let r2 = *od2 as f32 / *id2 as f32;
            r2.total_cmp(&r1) // Reverse order so max values are at beginning
        });
        self.print_links(&by_degrees);
        println!();
        println!("Most inlinks per non-zero outlink");
        println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯");
        // Reversing the out/in ranking gives exactly the in/out ranking.
        by_degrees.reverse();
        self.print_links(&by_degrees);
        Ok(())
    }

    /// Print the top `self.top` entries of a ranked degree table.
    // Takes a slice instead of `&Vec<_>` (idiomatic; callers' `&vec`
    // coerces automatically).
    fn print_links(&self, by_degrees: &[(&Page, usize, usize)]) {
        for (i, (page, od, id)) in by_degrees.iter().take(self.top).enumerate() {
            println!(
                "{:3}. {} ({} out, {} in)",
                i + 1,
                util::fmt_page(page),
                od.separate_with_underscores(),
                id.separate_with_underscores()
            );
        }
    }
}

View file

@ -0,0 +1,107 @@
use std::{cmp::Reverse, collections::HashSet, io};
use thousands::Separable;
use crate::{data::Data, graph::NodeIdx, util};
/// For every redirect page, follow the redirect chain to its end and
/// return `(start, final_target, chain_length)` triples. Loops are
/// reported on stdout and terminated at the repeated node.
fn find_redirects(data: &Data) -> Vec<(NodeIdx, NodeIdx, usize)> {
    let mut redirects = Vec::<(NodeIdx, NodeIdx, usize)>::new();
    for node in data.graph.nodes() {
        // Only redirect pages start a chain.
        if !data.pages[node.usize()].redirect {
            continue;
        }
        let mut seen = HashSet::new();
        let mut curr = node;
        seen.insert(node);
        while let Some(next) = data.redirect_target(curr) {
            if seen.contains(&next) {
                println!(" Redirect loop: {}", data.pages[node.usize()].title);
                break;
            }
            curr = next;
            seen.insert(next);
        }
        // seen includes the start node itself, hence the -1 for length.
        redirects.push((node, curr, seen.len() - 1));
    }
    redirects
}
/// Collect the chain of pages visited when following redirects from
/// `start`, including `start` itself. Stops before revisiting a page
/// (cycle guard).
fn follow_redirect(data: &Data, start: NodeIdx) -> Vec<NodeIdx> {
    let mut visited = HashSet::new();
    visited.insert(start);
    let mut chain = vec![start];
    let mut curr = start;
    while let Some(next) = data.redirect_target(curr) {
        // `insert` returns false when the page was already seen.
        if !visited.insert(next) {
            break;
        }
        chain.push(next);
        curr = next;
    }
    chain
}
/// Show redirect stats.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
    /// Show more detailed info.
    ///
    /// Additionally prints every redirect chain longer than one step,
    /// page by page.
    #[arg(long, short)]
    long: bool,
}
impl Cmd {
    /// Print summary statistics about redirect pages and, with `--long`,
    /// every redirect chain that needs more than one hop.
    pub fn run(self, data: Data) -> io::Result<()> {
        println!(">> Resolve redirects");
        let redirects = find_redirects(&data);
        println!(
            "There is a total of {} redirects.",
            redirects.len().separate_with_underscores()
        );

        // Chains needing more than one hop, longest first. The sort is
        // stable, so equal lengths keep their discovery order.
        let mut chains = redirects
            .iter()
            .filter(|(_, _, len)| *len > 1)
            .collect::<Vec<_>>();
        chains.sort_by_key(|(_, _, len)| Reverse(len));

        println!(
            "{} redirects take more than one step to reach an article.",
            chains.len().separate_with_underscores()
        );
        println!(
            "The longest redirect chain takes {} steps.",
            chains.iter().map(|(_, _, len)| len).max().copied().unwrap_or(0),
        );
        println!("Though these redirect chains are usually swiftly fixed by bots.");

        if self.long {
            println!();
            println!("Redirect chains with length > 1:");
            for (start, _, _) in chains {
                println!();
                for step in follow_redirect(&data, *start) {
                    println!("{}", util::fmt_page(&data.pages[step.usize()]));
                }
            }
        }
        Ok(())
    }
}

View file

@ -1,3 +1,218 @@
pub mod adjacency_list;
pub mod info;
pub mod store;
use std::{
fs::File,
io::{self, BufReader, BufWriter, Read, Write},
path::Path,
};
use crate::graph::{EdgeIdx, Graph, NodeIdx};
/// A single wiki page as stored in the brood data file.
#[derive(Debug, Clone)]
pub struct Page {
    /// Page id — presumably the id from the wiki dump; see the ingest
    /// pipeline to confirm.
    pub id: u32,
    /// Page title.
    pub title: String,
    /// Page length — assumed to be in bytes of article text; TODO confirm
    /// against the ingest pipeline.
    pub length: u32,
    /// Whether this page is a redirect. `Data::redirect_target` only ever
    /// follows the first outgoing link of a redirect page.
    pub redirect: bool,
}
/// A link occurrence inside a page.
#[derive(Debug, Default, Clone, Copy)]
pub struct Link {
    /// Start offset of the link — presumably a byte offset into the page
    /// text; confirm against the ingest pipeline.
    pub start: u32,
    /// Length of the link — unit set by the ingest pipeline.
    pub len: u32,
    /// Bit flags; see the `FLAG_*` constants below.
    pub flags: u8,
}

impl Link {
    /// Flag bit: the link occurs inside parentheses.
    pub const FLAG_IN_PARENS: u8 = 0b1;
    /// Flag bit: the link occurs inside a structure — exact meaning is
    /// defined by whatever sets the flag during ingest.
    pub const FLAG_IN_STRUCTURE: u8 = 0b10;

    /// Whether the in-parentheses flag bit is set.
    pub fn in_parens(self) -> bool {
        self.flags & Self::FLAG_IN_PARENS != 0
    }

    /// Whether the in-structure flag bit is set.
    pub fn in_structure(self) -> bool {
        self.flags & Self::FLAG_IN_STRUCTURE != 0
    }
}
/// Write a single byte.
fn write_u8(w: &mut impl Write, n: u8) -> io::Result<()> {
    w.write_all(&[n])
}
/// Read a single byte, failing if the reader is exhausted.
fn read_u8(r: &mut impl Read) -> io::Result<u8> {
    let mut byte = [0_u8; 1];
    r.read_exact(&mut byte)?;
    Ok(byte[0])
}
/// Write a `u16` in little-endian byte order.
fn write_u16(w: &mut impl Write, n: u16) -> io::Result<()> {
    let bytes = n.to_le_bytes();
    w.write_all(&bytes)
}
/// Read a `u16` stored in little-endian byte order.
fn read_u16(r: &mut impl Read) -> io::Result<u16> {
    let mut bytes = [0_u8; 2];
    r.read_exact(&mut bytes).map(|()| u16::from_le_bytes(bytes))
}
/// Write a `u32` in little-endian byte order.
fn write_u32(w: &mut impl Write, n: u32) -> io::Result<()> {
    let bytes = n.to_le_bytes();
    w.write_all(&bytes)
}
/// Read a `u32` stored in little-endian byte order.
fn read_u32(r: &mut impl Read) -> io::Result<u32> {
    let mut bytes = [0_u8; 4];
    r.read_exact(&mut bytes).map(|()| u32::from_le_bytes(bytes))
}
/// Write a string as a little-endian `u16` byte length followed by the
/// UTF-8 bytes.
///
/// Panics if the string is longer than `u16::MAX` bytes.
fn write_str(w: &mut impl Write, s: &str) -> io::Result<()> {
    let bytes = s.as_bytes();
    assert!(bytes.len() <= u16::MAX as usize);
    write_u16(w, bytes.len() as u16)?;
    w.write_all(bytes)
}
/// Read a string written by `write_str` (u16 little-endian byte length,
/// then UTF-8 bytes).
///
/// Returns an `InvalidData` error on malformed UTF-8 instead of panicking,
/// so a corrupt data file surfaces as an `io::Error` like every other
/// read failure.
fn read_str(r: &mut impl Read) -> io::Result<String> {
    let len = read_u16(r)? as usize;
    let mut buf = vec![0_u8; len];
    r.read_exact(&mut buf)?;
    String::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
}
/// Serialize a `Page`: id, length, redirect flag, then title.
/// `read_page` must mirror this order exactly.
fn write_page(w: &mut impl Write, page: &Page) -> io::Result<()> {
    write_u32(w, page.id)?;
    write_u32(w, page.length)?;
    let redirect_byte = u8::from(page.redirect);
    write_u8(w, redirect_byte)?;
    write_str(w, &page.title)
}
/// Deserialize a `Page`; the read order mirrors `write_page`.
pub fn read_page(r: &mut impl Read) -> io::Result<Page> {
    let id = read_u32(r)?;
    let length = read_u32(r)?;
    let redirect = read_u8(r)? != 0;
    let title = read_str(r)?;
    Ok(Page {
        id,
        title,
        length,
        redirect,
    })
}
/// Serialize a `Link`: start, len, flags.
/// `read_link` must mirror this order exactly.
fn write_link(w: &mut impl Write, link: &Link) -> io::Result<()> {
    write_u32(w, link.start)?;
    write_u32(w, link.len)?;
    write_u8(w, link.flags)
}
/// Deserialize a `Link`; the read order mirrors `write_link`.
fn read_link(r: &mut impl Read) -> io::Result<Link> {
    let start = read_u32(r)?;
    let len = read_u32(r)?;
    let flags = read_u8(r)?;
    Ok(Link { start, len, flags })
}
/// The full dataset: page metadata, link metadata, and the link graph.
///
/// `pages[i]` corresponds to graph node `i` and `links[j]` to graph edge
/// `j`; `check_consistency` asserts that the lengths line up.
#[derive(Default)]
pub struct Data {
    pub pages: Vec<Page>,
    pub links: Vec<Link>,
    pub graph: Graph,
}
impl Data {
    /// Create an empty dataset.
    pub fn new() -> Self {
        Self::default()
    }

    /// Create an empty dataset with preallocated room for `pages` pages
    /// and `links` links.
    pub fn with_capacity(pages: usize, links: usize) -> Self {
        Self {
            pages: Vec::with_capacity(pages),
            links: Vec::with_capacity(links),
            graph: Graph::with_capacity(pages, links),
        }
    }

    /// Serialize in the brood binary format.
    ///
    /// Layout: page count (u32), link count (u32), all pages, all links,
    /// all graph node pointers, all graph edge targets.
    /// `read` must mirror this order exactly.
    fn write(&self, w: &mut impl Write) -> io::Result<()> {
        assert!(self.pages.len() < u32::MAX as usize);
        assert!(self.links.len() < u32::MAX as usize);
        assert_eq!(self.pages.len(), self.graph.nodes.len());
        assert_eq!(self.links.len(), self.graph.edges.len());
        write_u32(w, self.pages.len() as u32)?;
        write_u32(w, self.links.len() as u32)?;
        for page in &self.pages {
            write_page(w, page)?;
        }
        for link in &self.links {
            write_link(w, link)?;
        }
        for node in &self.graph.nodes {
            write_u32(w, node.0)?;
        }
        for edge in &self.graph.edges {
            write_u32(w, edge.0)?;
        }
        Ok(())
    }

    /// Deserialize data written by `write`; checks graph consistency
    /// after loading.
    fn read(r: &mut impl Read) -> io::Result<Self> {
        let n_pages = read_u32(r)?;
        let n_links = read_u32(r)?;
        let mut result = Self::with_capacity(n_pages as usize, n_links as usize);
        for _ in 0..n_pages {
            result.pages.push(read_page(r)?);
        }
        for _ in 0..n_links {
            result.links.push(read_link(r)?);
        }
        for _ in 0..n_pages {
            result.graph.nodes.push(EdgeIdx(read_u32(r)?));
        }
        for _ in 0..n_links {
            result.graph.edges.push(NodeIdx(read_u32(r)?));
        }
        assert_eq!(result.pages.len(), result.graph.nodes.len());
        assert_eq!(result.links.len(), result.graph.edges.len());
        result.graph.check_consistency();
        Ok(result)
    }

    /// Write the dataset to `path` through a buffered writer.
    pub fn write_to_file(&self, path: &Path) -> io::Result<()> {
        let mut file = BufWriter::new(File::create(path)?);
        self.write(&mut file)?;
        // Flush explicitly: BufWriter's Drop also flushes, but silently
        // swallows any error, so a failed final write would go unnoticed.
        file.flush()
    }

    /// Read a dataset from `path` through a buffered reader.
    pub fn read_from_file(path: &Path) -> io::Result<Self> {
        let mut file = BufReader::new(File::open(path)?);
        Self::read(&mut file)
    }

    /// Panic if pages/links don't line up with the graph, or if the graph
    /// itself is inconsistent.
    pub fn check_consistency(&self) {
        assert_eq!(
            self.pages.len(),
            self.graph.nodes.len(),
            "inconsistent number of pages"
        );
        assert_eq!(
            self.links.len(),
            self.graph.edges.len(),
            "inconsistent number of links"
        );
        self.graph.check_consistency();
    }

    /// The page a redirect leads to: the first outgoing edge of `node`.
    ///
    /// Returns `None` for non-redirect pages, and for redirect pages
    /// without any outgoing link.
    pub fn redirect_target(&self, node: NodeIdx) -> Option<NodeIdx> {
        if !self.pages[node.usize()].redirect {
            return None;
        }
        self.graph.edge_slice(node).first().copied()
    }
}

View file

@ -1,160 +0,0 @@
use std::ops::Range;
use super::info::{LinkInfo, PageInfo};
#[derive(Debug, Clone, Copy)]
pub struct Page<P> {
/// Index of the first link belonging to this page.
pub start: u32,
pub data: P,
}
impl<P> Page<P> {
pub fn change_data<P2>(self, f: impl Fn(P) -> P2) -> Page<P2> {
Page {
start: self.start,
data: f(self.data),
}
}
}
#[derive(Debug, Clone, Copy)]
pub struct Link<L> {
/// Index of the page this link points to.
pub to: u32,
pub data: L,
}
impl<L> Link<L> {
pub fn change_data<L2>(self, f: impl Fn(L) -> L2) -> Link<L2> {
Link {
to: self.to,
data: f(self.data),
}
}
}
pub struct AdjacencyList<P, L> {
pub pages: Vec<Page<P>>,
pub links: Vec<Link<L>>,
}
impl<P, L> Default for AdjacencyList<P, L> {
fn default() -> Self {
Self {
pages: Default::default(),
links: Default::default(),
}
}
}
impl<P, L> AdjacencyList<P, L> {
pub fn push_page(&mut self, data: P) {
self.pages.push(Page {
start: self.links.len() as u32,
data,
});
}
pub fn push_link(&mut self, to: u32, data: L) {
self.links.push(Link { to, data })
}
pub fn page(&self, page_idx: u32) -> &Page<P> {
&self.pages[page_idx as usize]
}
pub fn page_mut(&mut self, page_idx: u32) -> &mut Page<P> {
&mut self.pages[page_idx as usize]
}
pub fn pages(&self) -> impl Iterator<Item = (u32, &Page<P>)> {
self.pages.iter().enumerate().map(|(i, p)| (i as u32, p))
}
pub fn link(&self, link_idx: u32) -> &Link<L> {
&self.links[link_idx as usize]
}
pub fn link_mut(&mut self, link_idx: u32) -> &mut Link<L> {
&mut self.links[link_idx as usize]
}
pub fn link_range(&self, page_idx: u32) -> Range<u32> {
let start_idx = self.pages[page_idx as usize].start;
let end_idx = match self.pages.get(page_idx as usize + 1) {
Some(page) => page.start,
None => self.links.len() as u32,
};
start_idx..end_idx
}
pub fn link_redirect(&self, page_idx: u32) -> Option<u32> {
let range = self.link_range(page_idx);
if range.is_empty() {
None
} else {
Some(range.start)
}
}
pub fn links(&self, page_idx: u32) -> impl Iterator<Item = (u32, &Link<L>)> {
self.link_range(page_idx).map(|i| (i, self.link(i)))
}
pub fn change_page_data<P2>(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList<P2, L> {
let pages = self
.pages
.into_iter()
.map(|p| p.change_data(page_f))
.collect::<Vec<_>>();
AdjacencyList {
pages,
links: self.links,
}
}
pub fn change_link_data<L2>(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList<P, L2> {
let links = self
.links
.into_iter()
.map(|l| l.change_data(link_f))
.collect::<Vec<_>>();
AdjacencyList {
pages: self.pages,
links,
}
}
}
impl AdjacencyList<PageInfo, LinkInfo> {
pub fn check_consistency(&self) {
// Check that all types are large enough
assert!(self.pages.len() < u32::MAX as usize, "too many pages");
assert!(self.links.len() < u32::MAX as usize, "too many links");
for page in &self.pages {
assert!(
page.data.title.len() <= u8::MAX as usize,
"page title too long"
);
}
// Check that all links contain valid indices. Links must not link to
// the sentinel page.
let range = 0..self.pages.len() as u32;
for link in &self.links {
assert!(range.contains(&link.to), "invalid link");
}
// Check that all redirect pages have at most one link
for (page_idx, page) in self.pages.iter().enumerate() {
if page.data.redirect {
let range = self.link_range(page_idx as u32);
let amount = range.end - range.start;
assert!(amount <= 1, "too many redirect links");
}
}
}
}

View file

@ -1,24 +0,0 @@
#[derive(Debug, Clone)]
pub struct PageInfo {
pub id: u32,
pub title: String,
pub length: u32,
pub redirect: bool,
}
#[derive(Debug, Default, Clone, Copy)]
pub struct LinkInfo {
pub start: u32,
pub len: u32,
pub flags: u8,
}
impl LinkInfo {
pub fn in_parens(self) -> bool {
self.flags & 0b1 != 0
}
pub fn in_structure(self) -> bool {
self.flags & 0b10 != 0
}
}

View file

@ -1,134 +0,0 @@
use std::io::{self, Read, Write};
use super::{
adjacency_list::{AdjacencyList, Link, Page},
info::{LinkInfo, PageInfo},
};
fn write_u8<W: Write>(n: u8, to: &mut W) -> io::Result<()> {
to.write_all(&n.to_le_bytes())
}
fn read_u8<R: Read>(from: &mut R) -> io::Result<u8> {
let mut buf = [0_u8; 1];
from.read_exact(&mut buf)?;
Ok(u8::from_le_bytes(buf))
}
fn write_u16<W: Write>(n: u16, to: &mut W) -> io::Result<()> {
to.write_all(&n.to_le_bytes())
}
fn read_u16<R: Read>(from: &mut R) -> io::Result<u16> {
let mut buf = [0_u8; 2];
from.read_exact(&mut buf)?;
Ok(u16::from_le_bytes(buf))
}
fn write_u32<W: Write>(n: u32, to: &mut W) -> io::Result<()> {
to.write_all(&n.to_le_bytes())
}
fn read_u32<R: Read>(from: &mut R) -> io::Result<u32> {
let mut buf = [0_u8; 4];
from.read_exact(&mut buf)?;
Ok(u32::from_le_bytes(buf))
}
fn write_str<W: Write>(s: &str, to: &mut W) -> io::Result<()> {
assert!(s.len() <= u16::MAX as usize);
write_u16(s.len() as u16, to)?;
to.write_all(s.as_bytes())?;
Ok(())
}
fn read_str<R: Read>(from: &mut R) -> io::Result<String> {
let len = read_u16(from)? as usize;
let mut buf = vec![0_u8; len];
from.read_exact(&mut buf)?;
Ok(String::from_utf8(buf).unwrap())
}
fn write_page<W: Write>(page: &Page<PageInfo>, to: &mut W) -> io::Result<()> {
write_u32(page.start, to)?;
write_u32(page.data.id, to)?;
write_u32(page.data.length, to)?;
write_u8(if page.data.redirect { 1 } else { 0 }, to)?;
write_str(&page.data.title, to)?;
Ok(())
}
pub fn read_page<R: Read>(from: &mut R) -> io::Result<Page<PageInfo>> {
let start_link_idx = read_u32(from)?;
let id = read_u32(from)?;
let length = read_u32(from)?;
let redirect = read_u8(from)? != 0;
let title = read_str(from)?;
Ok(Page {
start: start_link_idx,
data: PageInfo {
id,
length,
redirect,
title,
},
})
}
fn write_link<W: Write>(link: &Link<LinkInfo>, to: &mut W) -> io::Result<()> {
write_u32(link.to, to)?;
write_u32(link.data.start, to)?;
write_u32(link.data.len, to)?;
write_u8(link.data.flags, to)?;
Ok(())
}
fn read_link<R: Read>(from: &mut R) -> io::Result<Link<LinkInfo>> {
let to_page_idx = read_u32(from)?;
let start = read_u32(from)?;
let len = read_u32(from)?;
let flags = read_u8(from)?;
Ok(Link {
to: to_page_idx,
data: LinkInfo { start, len, flags },
})
}
pub fn write_adjacency_list<W: Write>(
al: &AdjacencyList<PageInfo, LinkInfo>,
to: &mut W,
) -> io::Result<()> {
write_u32(al.pages.len() as u32, to)?;
write_u32(al.links.len() as u32, to)?;
for page in &al.pages {
write_page(page, to)?;
}
for link in &al.links {
write_link(link, to)?;
}
Ok(())
}
pub fn read_adjacency_list<R: Read>(from: &mut R) -> io::Result<AdjacencyList<PageInfo, LinkInfo>> {
let n_pages = read_u32(from)?;
let n_links = read_u32(from)?;
let mut pages = vec![];
for _ in 0..n_pages {
pages.push(read_page(from)?);
}
let mut links = vec![];
for _ in 0..n_links {
links.push(read_link(from)?);
}
Ok(AdjacencyList { pages, links })
}

295
brood/src/graph.rs Normal file
View file

@ -0,0 +1,295 @@
use std::ops::{Add, AddAssign, Range, Sub, SubAssign};
/// Index of a node in a `Graph`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct NodeIdx(pub u32);

impl NodeIdx {
    /// Sentinel value meaning "no node".
    pub const NONE: Self = Self(u32::MAX);

    /// Build a node index from a `usize`; `value` must fit in a `u32`
    /// (the `as` cast truncates otherwise).
    #[inline]
    pub const fn new(value: usize) -> Self {
        Self(value as u32)
    }

    /// The index as a `usize`, for slice indexing.
    #[inline]
    pub const fn usize(self) -> usize {
        self.0 as usize
    }
}

impl From<u32> for NodeIdx {
    fn from(value: u32) -> Self {
        NodeIdx(value)
    }
}

impl From<usize> for NodeIdx {
    fn from(value: usize) -> Self {
        NodeIdx::new(value)
    }
}

impl Add for NodeIdx {
    type Output = NodeIdx;

    fn add(self, rhs: NodeIdx) -> NodeIdx {
        NodeIdx(self.0 + rhs.0)
    }
}

impl AddAssign for NodeIdx {
    fn add_assign(&mut self, rhs: NodeIdx) {
        *self = *self + rhs;
    }
}

impl Sub for NodeIdx {
    type Output = NodeIdx;

    fn sub(self, rhs: NodeIdx) -> NodeIdx {
        NodeIdx(self.0 - rhs.0)
    }
}

impl SubAssign for NodeIdx {
    fn sub_assign(&mut self, rhs: NodeIdx) {
        *self = *self - rhs;
    }
}

impl Add<u32> for NodeIdx {
    type Output = NodeIdx;

    fn add(self, offset: u32) -> NodeIdx {
        NodeIdx(self.0 + offset)
    }
}

impl AddAssign<u32> for NodeIdx {
    fn add_assign(&mut self, offset: u32) {
        *self = *self + offset;
    }
}

impl Sub<u32> for NodeIdx {
    type Output = NodeIdx;

    fn sub(self, offset: u32) -> NodeIdx {
        NodeIdx(self.0 - offset)
    }
}

impl SubAssign<u32> for NodeIdx {
    fn sub_assign(&mut self, offset: u32) {
        *self = *self - offset;
    }
}
/// Index of an edge in a `Graph`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct EdgeIdx(pub u32);

impl EdgeIdx {
    /// Build an edge index from a `usize`; `value` must fit in a `u32`
    /// (the `as` cast truncates otherwise).
    #[inline]
    pub const fn new(value: usize) -> Self {
        Self(value as u32)
    }

    /// The index as a `usize`, for slice indexing.
    #[inline]
    pub const fn usize(self) -> usize {
        self.0 as usize
    }
}

impl From<u32> for EdgeIdx {
    fn from(value: u32) -> Self {
        EdgeIdx(value)
    }
}

impl From<usize> for EdgeIdx {
    fn from(value: usize) -> Self {
        EdgeIdx::new(value)
    }
}

impl Add for EdgeIdx {
    type Output = EdgeIdx;

    fn add(self, rhs: EdgeIdx) -> EdgeIdx {
        EdgeIdx(self.0 + rhs.0)
    }
}

impl AddAssign for EdgeIdx {
    fn add_assign(&mut self, rhs: EdgeIdx) {
        *self = *self + rhs;
    }
}

impl Sub for EdgeIdx {
    type Output = EdgeIdx;

    fn sub(self, rhs: EdgeIdx) -> EdgeIdx {
        EdgeIdx(self.0 - rhs.0)
    }
}

impl SubAssign for EdgeIdx {
    fn sub_assign(&mut self, rhs: EdgeIdx) {
        *self = *self - rhs;
    }
}

impl Add<u32> for EdgeIdx {
    type Output = EdgeIdx;

    fn add(self, offset: u32) -> EdgeIdx {
        EdgeIdx(self.0 + offset)
    }
}

impl AddAssign<u32> for EdgeIdx {
    fn add_assign(&mut self, offset: u32) {
        *self = *self + offset;
    }
}

impl Sub<u32> for EdgeIdx {
    type Output = EdgeIdx;

    fn sub(self, offset: u32) -> EdgeIdx {
        EdgeIdx(self.0 - offset)
    }
}

impl SubAssign<u32> for EdgeIdx {
    fn sub_assign(&mut self, offset: u32) {
        *self = *self - offset;
    }
}
/// A directed graph in compressed adjacency form: `nodes` holds per-node
/// offsets into the flat `edges` array (CSR-style layout).
#[derive(Default)]
pub struct Graph {
    /// A node points to the first of its edges.
    ///
    /// A special case is that if the subsequent node points to the same edge,
    /// the current node has no edges.
    pub nodes: Vec<EdgeIdx>,
    /// An edge points to a target node.
    ///
    /// The source node is defined implicitly by the graph data structure.
    pub edges: Vec<NodeIdx>,
}
impl Graph {
    /// Create an empty graph with preallocated capacity.
    pub fn with_capacity(nodes: usize, edges: usize) -> Self {
        Self {
            nodes: Vec::with_capacity(nodes),
            edges: Vec::with_capacity(edges),
        }
    }

    /// Append a node whose edge list begins at the current end of `edges`.
    pub fn add_node(&mut self) {
        let first_edge = EdgeIdx::new(self.edges.len());
        self.nodes.push(first_edge);
    }

    /// Append an edge pointing at `target`; it belongs to the node that
    /// was added most recently.
    pub fn add_edge(&mut self, target: NodeIdx) {
        self.edges.push(target);
    }

    /// Panic if any structural invariant of the graph is violated.
    pub fn check_consistency(&self) {
        if self.nodes.is_empty() {
            assert!(self.edges.is_empty(), "edges must belong to existing nodes");
            return;
        }
        assert!(self.nodes.len() < u32::MAX as usize, "too many nodes");
        assert!(self.edges.len() < u32::MAX as usize, "too many edges");
        assert_eq!(
            *self.nodes.first().unwrap(),
            EdgeIdx(0),
            "first node pointer must be 0"
        );
        for (ni, &node) in self.nodes.iter().enumerate() {
            assert!(
                node.usize() <= self.edges.len(),
                "node pointers must be in range"
            );
            if let Some(&succ) = self.nodes.get(ni + 1) {
                assert!(node <= succ, "node pointers must be well-ordered");
            }
        }
        for &edge in &self.edges {
            assert!(
                edge.usize() < self.nodes.len(),
                "edge pointers must be in range"
            );
        }
    }

    /// Iterate over all node indices.
    pub fn nodes(&self) -> impl Iterator<Item = NodeIdx> + '_ {
        (0..self.nodes.len()).map(NodeIdx::new)
    }

    /// Iterate over all edges as `(source, target)` pairs.
    pub fn edges(&self) -> impl Iterator<Item = (NodeIdx, NodeIdx)> + '_ {
        Edges::new(self)
    }

    /// First edge index of `node`. For the index one past the last node
    /// this yields `edges.len()`, which serves as the end sentinel.
    pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx {
        match self.nodes.get(node.usize()) {
            Some(&start) => start,
            None => EdgeIdx::new(self.edges.len()),
        }
    }

    /// The range of edge indices belonging to `node`.
    pub fn edge_range(&self, node: NodeIdx) -> Range<usize> {
        let lo = self.nodes[node.usize()].usize();
        let hi = self.edge_start(node + 1).usize();
        lo..hi
    }

    /// The edge targets of `node` as a slice.
    pub fn edge_slice(&self, node: NodeIdx) -> &[NodeIdx] {
        &self.edges[self.edge_range(node)]
    }
}
/// Iterator over all `(source, target)` edge pairs of a `Graph`.
struct Edges<'a> {
    graph: &'a Graph,
    /// Candidate source node for the current edge.
    ni: NodeIdx,
    /// The next edge to yield.
    ei: EdgeIdx,
}

impl<'a> Edges<'a> {
    fn new(graph: &'a Graph) -> Self {
        Self {
            graph,
            ni: NodeIdx(0),
            ei: EdgeIdx(0),
        }
    }
}

impl Iterator for Edges<'_> {
    type Item = (NodeIdx, NodeIdx);

    fn next(&mut self) -> Option<Self::Item> {
        if self.ei.usize() >= self.graph.edges.len() {
            return None;
        }
        let target = self.graph.edges[self.ei.usize()];
        // Advance `ni` until the current edge falls inside its edge range.
        // A single `if` would not be sufficient because some nodes may not
        // have any edges (consecutive nodes sharing the same edge start).
        while self.ei >= self.graph.edge_start(self.ni + 1) {
            self.ni += 1;
        }
        let source = self.ni;
        self.ei += 1;
        Some((source, target))
    }
}

View file

@ -1,49 +1,23 @@
pub mod commands;
mod algo;
mod commands;
mod data;
mod graph;
mod util;
use std::io;
use std::path::PathBuf;
use std::{io, path::PathBuf};
use clap::Parser;
#[derive(Debug, PartialEq, Eq, Parser)]
pub enum PhilosophyGameCmd {
First,
Canonical,
Cluster,
Trace { start: String },
}
use data::Data;
#[derive(Debug, Parser)]
enum Command {
/// Read sift data on stdin and output brood data.
Ingest,
/// Read and reexport brood data.
Reexport {
to: PathBuf,
#[arg(long, short = 'P')]
in_parens: Option<bool>,
#[arg(long, short = 'S')]
in_structure: Option<bool>,
},
/// Find a path from one article to another.
Path {
from: String,
to: String,
/// Flip start and end article.
#[arg(short, long)]
flip: bool,
},
/// Find the longest shortest path starting at an article.
LongestShortestPath { from: String },
/// Analyze articles using "Philosophy Game" rules.
PhilosophyGame {
#[command(subcommand)]
subcmd: PhilosophyGameCmd,
},
/// Print all page titles.
ListPages,
Ingest(commands::ingest::Cmd),
Export(commands::export::Cmd),
Show(commands::show::Cmd),
Stats(commands::stats::Cmd),
Path(commands::path::Cmd),
LongestPath(commands::longest_path::Cmd),
Pg(commands::pg::Cmd),
}
#[derive(Debug, Parser)]
@ -51,30 +25,59 @@ struct Args {
datafile: PathBuf,
#[command(subcommand)]
command: Command,
#[arg(long, short = 'P')]
in_parens: Option<bool>,
#[arg(long, short = 'S')]
in_structure: Option<bool>,
#[arg(long, short = 'R')]
resolve_redirects: bool,
#[arg(long, short = 'I')]
invert_edges: bool,
#[arg(long, short)]
check_consistency: bool,
}
fn main() -> io::Result<()> {
let args = Args::parse();
if let Command::Ingest(cmd) = &args.command {
return cmd.run(&args.datafile);
}
println!(">> Import");
println!("> Reading data");
let mut data = Data::read_from_file(&args.datafile)?;
if args.in_parens.is_some() || args.in_structure.is_some() {
println!("> Filtering edges");
algo::retain_edges(&mut data, |link| {
args.in_parens.is_none_or(|b| b == link.in_parens())
&& args.in_structure.is_none_or(|b| b == link.in_structure())
});
}
if args.resolve_redirects {
println!("> Resolving redirects");
algo::resolve_redirects(&mut data);
}
if args.invert_edges {
println!("> Inverting edges");
algo::invert(&mut data);
}
if args.check_consistency {
println!("> Checking consistencey");
data.check_consistency();
}
match args.command {
Command::Ingest => commands::ingest::ingest(&args.datafile),
Command::Reexport {
to,
in_parens,
in_structure,
} => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure),
Command::Path { from, to, flip } => {
if flip {
commands::path::path(&args.datafile, &to, &from)
} else {
commands::path::path(&args.datafile, &from, &to)
}
}
Command::LongestShortestPath { from } => {
commands::longest_shortest_path::run(&args.datafile, &from)
}
Command::PhilosophyGame { subcmd } => {
commands::philosophy_game::run(&args.datafile, subcmd)
}
Command::ListPages => commands::list_pages::run(&args.datafile),
Command::Ingest(_) => unreachable!(),
Command::Export(cmd) => cmd.run(data),
Command::Show(cmd) => cmd.run(data),
Command::Stats(cmd) => cmd.run(data),
Command::Path(cmd) => cmd.run(data),
Command::LongestPath(cmd) => cmd.run(data),
Command::Pg(cmd) => cmd.run(data),
}
}

View file

@ -1,39 +1,160 @@
use crate::data::{
adjacency_list::{AdjacencyList, Page},
info::{LinkInfo, PageInfo},
use std::{collections::HashSet, fmt};
use regex::Regex;
use crate::{
data::{Data, Page},
graph::NodeIdx,
};
pub fn normalize_link(link: &str) -> String {
let link = link.trim().replace(' ', "_");
// https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/mediawiki.Title.phpCharToUpper.js
struct PhpCharToUpper(char);
// Make only first char lowercase
link.chars()
.next()
.iter()
.flat_map(|c| c.to_lowercase())
.chain(link.chars().skip(1))
.collect::<String>()
}
impl fmt::Display for PhpCharToUpper {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.0 {
// Do something special, I guess
'ᾀ' => write!(f, ""),
'ᾁ' => write!(f, ""),
'ᾂ' => write!(f, ""),
'ᾃ' => write!(f, ""),
'ᾄ' => write!(f, ""),
'ᾅ' => write!(f, ""),
'ᾆ' => write!(f, ""),
'ᾇ' => write!(f, ""),
'ᾐ' => write!(f, ""),
'ᾑ' => write!(f, ""),
'ᾒ' => write!(f, ""),
'ᾓ' => write!(f, ""),
'ᾔ' => write!(f, ""),
'ᾕ' => write!(f, ""),
'ᾖ' => write!(f, ""),
'ᾗ' => write!(f, ""),
'ᾠ' => write!(f, ""),
'ᾡ' => write!(f, ""),
'ᾢ' => write!(f, ""),
'ᾣ' => write!(f, ""),
'ᾤ' => write!(f, ""),
'ᾥ' => write!(f, ""),
'ᾦ' => write!(f, ""),
'ᾧ' => write!(f, ""),
'ᾳ' => write!(f, ""),
'ῃ' => write!(f, ""),
'ῳ' => write!(f, ""),
pub fn find_index_of_title(pages: &[Page<PageInfo>], title: &str) -> u32 {
let title = normalize_link(title);
pages
.iter()
.enumerate()
.find(|(_, p)| normalize_link(&p.data.title) == title)
.map(|(i, _)| i)
.expect("invalid title") as u32
}
pub fn resolve_redirects(data: &AdjacencyList<PageInfo, LinkInfo>, mut page_idx: u32) -> u32 {
loop {
if data.page(page_idx).data.redirect {
if let Some(link_idx) = data.link_redirect(page_idx) {
page_idx = data.link(link_idx).to;
continue;
// Do not capitalize
'ß' | 'ʼn' | 'ǰ' | 'ʂ' | 'ͅ' | 'ΐ' | 'ΰ' | 'և' | 'ა' | 'ბ' | 'გ' | 'დ' | 'ე' | 'ვ'
| 'ზ' | 'თ' | 'ი' | 'კ' | 'ლ' | 'მ' | 'ნ' | 'ო' | 'პ' | 'ჟ' | 'რ' | 'ს' | 'ტ' | 'უ'
| 'ფ' | 'ქ' | 'ღ' | '' | 'შ' | 'ჩ' | 'ც' | 'ძ' | 'წ' | 'ჭ' | 'ხ' | 'ჯ' | 'ჰ' | 'ჱ'
| 'ჲ' | 'ჳ' | 'ჴ' | 'ჵ' | 'ჶ' | 'ჷ' | 'ჸ' | 'ჹ' | 'ჺ' | 'ჽ' | 'ჾ' | '' | 'ᶎ' | 'ẖ'
| 'ẗ' | 'ẘ' | 'ẙ' | 'ẚ' | 'ὐ' | 'ὒ' | 'ὔ' | 'ὖ' | 'ᾈ' | 'ᾉ' | 'ᾊ' | 'ᾋ' | 'ᾌ' | 'ᾍ'
| 'ᾎ' | 'ᾏ' | 'ᾘ' | 'ᾙ' | 'ᾚ' | 'ᾛ' | 'ᾜ' | 'ᾝ' | 'ᾞ' | 'ᾟ' | 'ᾨ' | 'ᾩ' | 'ᾪ' | 'ᾫ'
| 'ᾬ' | 'ᾭ' | 'ᾮ' | 'ᾯ' | 'ᾲ' | 'ᾴ' | 'ᾶ' | 'ᾷ' | 'ᾼ' | 'ῂ' | 'ῄ' | 'ῆ' | 'ῇ' | 'ῌ'
| 'ῒ' | 'ΐ' | 'ῖ' | 'ῗ' | 'ῢ' | 'ΰ' | 'ῤ' | 'ῦ' | 'ῧ' | 'ῲ' | 'ῴ' | 'ῶ' | 'ῷ' | 'ῼ'
| '' | 'ⅱ' | 'ⅲ' | 'ⅳ' | '' | 'ⅵ' | 'ⅶ' | 'ⅷ' | 'ⅸ' | '' | 'ⅺ' | 'ⅻ' | '' | ''
| '' | 'ⅿ' | 'ⓐ' | 'ⓑ' | 'ⓒ' | 'ⓓ' | 'ⓔ' | 'ⓕ' | 'ⓖ' | 'ⓗ' | 'ⓘ' | 'ⓙ' | 'ⓚ' | 'ⓛ'
| 'ⓜ' | 'ⓝ' | 'ⓞ' | 'ⓟ' | 'ⓠ' | 'ⓡ' | 'ⓢ' | 'ⓣ' | 'ⓤ' | 'ⓥ' | 'ⓦ' | 'ⓧ' | 'ⓨ' | 'ⓩ'
| 'ꞔ' | 'ꞹ' | 'ꞻ' | 'ꞽ' | 'ꞿ' | 'ꟃ' | 'ff' | 'fi' | 'fl' | 'ffi' | 'ffl' | 'ſt' | 'st' | 'ﬓ'
| 'ﬔ' | 'ﬕ' | 'ﬖ' | 'ﬗ' | '𖹠' | '𖹡' | '𖹢' | '𖹣' | '𖹤' | '𖹥' | '𖹦' | '𖹧' | '𖹨' | '𖹩'
| '𖹪' | '𖹫' | '𖹬' | '𖹭' | '𖹮' | '𖹯' | '𖹰' | '𖹱' | '𖹲' | '𖹳' | '𖹴' | '𖹵' | '𖹶' | '𖹷'
| '𖹸' | '𖹹' | '𖹺' | '𖹻' | '𖹼' | '𖹽' | '𖹾' | '𖹿' => {
write!(f, "{}", self.0)
}
}
return page_idx;
// Capitalize normally
c => write!(f, "{}", c.to_uppercase()),
}
}
}
/// Normalizes article titles so that differently written titles of the
/// same article compare equal (see `normalize` for the exact steps).
///
/// The regexes are compiled once at construction and reused across calls.
pub struct TitleNormalizer {
    // Matches Unicode bidi override characters, which get stripped.
    strip_bidi: Regex,
    // Matches runs of space-like characters, collapsed to a single '_'.
    clean_up_whitespace: Regex,
    // Match leading/trailing runs of '_', which get trimmed.
    trim_underscore_start: Regex,
    trim_underscore_end: Regex,
}
impl TitleNormalizer {
    /// Compile the normalization regexes.
    pub fn new() -> Self {
        Self {
            strip_bidi: Regex::new("[\u{200E}\u{200F}\u{202A}-\u{202E}]").unwrap(),
            clean_up_whitespace: Regex::new(concat!(
                "[ _\u{00A0}\u{1680}\u{180E}\u{2000}-\u{200A}",
                "\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+"
            ))
            .unwrap(),
            trim_underscore_start: Regex::new("^_+").unwrap(),
            trim_underscore_end: Regex::new("_+$").unwrap(),
        }
    }

    /// Normalize an article title.
    ///
    /// Strips bidi override characters, collapses whitespace to
    /// underscores, trims leading/trailing underscores, and uppercases the
    /// first character using PHP's case rules (`PhpCharToUpper`), mirroring
    /// MediaWiki's own normalization.
    ///
    /// See also <https://github.com/wikimedia/mediawiki-title>.
    pub fn normalize(&self, title: &str) -> String {
        // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L403
        // Strip Unicode bidi override characters
        let title = self.strip_bidi.replace_all(title, "");
        // Clean up whitespace
        let title = self.clean_up_whitespace.replace_all(&title, "_");
        // Trim _ from beginning and end
        let title = self.trim_underscore_start.replace_all(&title, "");
        let title = self.trim_underscore_end.replace_all(&title, "");
        // Uppercase only the first character, the PHP way:
        // https://github.com/wikimedia/mediawiki-title/blob/6880ae1a9ffdfa2eea9fd75b472493a67dabcc48/lib/index.js#L206
        let Some(first) = title.chars().next() else {
            return String::new();
        };
        let rest = &title[first.len_utf8()..];
        format!("{}{rest}", PhpCharToUpper(first))
    }
}
/// Find the node whose normalized title matches the (normalized) `title`.
///
/// Linear scan over all pages; panics with "invalid title" when nothing
/// matches.
pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx {
    let needle = normalizer.normalize(title);
    let idx = data
        .pages
        .iter()
        .position(|p| normalizer.normalize(&p.title) == needle)
        .expect("invalid title");
    NodeIdx::new(idx)
}
/// Follow redirects from `node` until a non-redirect page is reached.
///
/// Redirect cycles are detected, reported, and broken by returning the
/// last page visited before the repeat.
pub fn resolve_redirects(data: &Data, node: NodeIdx) -> NodeIdx {
    let mut curr = node;
    let mut seen = HashSet::new();
    seen.insert(curr);
    while let Some(target) = data.redirect_target(curr) {
        if seen.contains(&target) {
            // Message typo fixed: "deteted" -> "detected".
            println!(
                " Redirect cycle detected: {:?}",
                data.pages[node.usize()].title
            );
            break;
        }
        seen.insert(target);
        curr = target;
    }
    curr
}
/// Look up a title and follow any redirects to the final article.
pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx {
    let node = locate_title(normalizer, data, title);
    resolve_redirects(data, node)
}
/// Format a page for display: redirects get a "v" marker, articles "-".
pub fn fmt_page(page: &Page) -> String {
    let marker = if page.redirect { "v" } else { "-" };
    format!("{marker} {}", page.title)
}

View file

@ -172,16 +172,21 @@ def process_xmldump_page(page):
# Page info as simple tuples
def simple_pages(input):
dump = mwxml.Dump.from_file(sys.stdin)
articles = 0
for i, page in enumerate(dump.pages):
if (i + 1) % 1000 == 0:
# Yeah, the articles are usually off by one
eprint(f"{i+1:8} pages, {articles:8} articles, at pid {page.id:8}")
if page.namespace != 0:
continue
if (i + 1) % 1000 == 0:
eprint(f"{i+1:8} pages, at pid {page.id:8}")
articles += 1
[revision] = list(page) # Every page has exactly one revision
yield page.id, page.title, revision.text or "", page.redirect
eprint(f"{articles} articles total")
def process_simple_page(info):
pid, title, text, redirect = info