From 8605f8d43ff8af47159791b2bc8c553520e6add1 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Tue, 1 Nov 2022 01:22:15 +0100
Subject: [PATCH] Implement leon-wiki-graph command

---
 brood/Cargo.lock                      |  63 ++++++++++++++-
 brood/Cargo.toml                      |   1 +
 brood/src/commands.rs                 |   1 +
 brood/src/commands/leon_wiki_graph.rs | 106 ++++++++++++++++++++++++++
 brood/src/main.rs                     |  15 ++--
 5 files changed, 177 insertions(+), 9 deletions(-)
 create mode 100644 brood/src/commands/leon_wiki_graph.rs

diff --git a/brood/Cargo.lock b/brood/Cargo.lock
index 3cbe482..95bce17 100644
--- a/brood/Cargo.lock
+++ b/brood/Cargo.lock
@@ -42,12 +42,25 @@ version = "0.1.0"
 dependencies = [
  "ciborium",
  "clap",
+ "csv",
  "rustc-hash",
  "serde",
  "serde_json",
  "simd-json",
 ]
 
+[[package]]
+name = "bstr"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
+dependencies = [
+ "lazy_static",
+ "memchr",
+ "regex-automata",
+ "serde",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@@ -118,6 +131,28 @@ dependencies = [
  "os_str_bytes",
 ]
 
+[[package]]
+name = "csv"
+version = "1.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
+dependencies = [
+ "bstr",
+ "csv-core",
+ "itoa 0.4.8",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "float-cmp"
 version = "0.9.0"
@@ -178,18 +213,36 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "itoa"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
+
 [[package]]
 name = "itoa"
 version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
 
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
 [[package]]
 name = "libc"
 version = "0.2.134"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
 
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
 [[package]]
 name = "num-traits"
 version = "0.2.15"
@@ -253,6 +306,12 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+
 [[package]]
 name = "rustc-hash"
 version = "1.1.0"
@@ -291,7 +350,7 @@ version = "1.0.85"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
 dependencies = [
- "itoa",
+ "itoa 1.0.3",
  "ryu",
  "serde",
 ]
@@ -355,7 +414,7 @@ checksum = "c0a635407649b66e125e4d2ffd208153210179f8c7c8b71c030aa2ad3eeb4c8f"
 dependencies = [
  "float-cmp",
  "halfbrown",
- "itoa",
+ "itoa 1.0.3",
  "ryu",
 ]
 
diff --git a/brood/Cargo.toml b/brood/Cargo.toml
index c42fb6c..d05e7f8 100644
--- a/brood/Cargo.toml
+++ b/brood/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2021"
 [dependencies]
 ciborium = "0.2.0"
 clap = { version = "4.0.5", features = ["derive"] }
+csv = "1.1.6"
 rustc-hash = "1.1.0"
 serde = { version = "1.0.145", features = ["derive"] }
 serde_json = "1.0.85"
diff --git a/brood/src/commands.rs b/brood/src/commands.rs
index 02d47b9..6e64354 100644
--- a/brood/src/commands.rs
+++ b/brood/src/commands.rs
@@ -1,4 +1,5 @@
 pub mod ingest;
+pub mod leon_wiki_graph;
 pub mod list_pages;
 pub mod longest_shortest_path;
 pub mod path;
diff --git a/brood/src/commands/leon_wiki_graph.rs b/brood/src/commands/leon_wiki_graph.rs
new file mode 100644
index 0000000..72cac4a
--- /dev/null
+++ b/brood/src/commands/leon_wiki_graph.rs
@@ -0,0 +1,106 @@
+//! Export the link graph between a set of Wikipedia articles as CSV files.
+//!
+//! Reads brood data and a JSON list of articles, then writes `nodes.csv` and
+//! `edges.csv` into the current working directory.
+
+use std::collections::HashSet;
+use std::fs::File;
+use std::io::{self, BufReader, BufWriter};
+use std::path::Path;
+
+use serde::{Deserialize, Serialize};
+
+use crate::data::AdjacencyList;
+
+/// One entry of the articles file.
+#[derive(Deserialize)]
+struct Article {
+    title: String,
+    url: String,
+    language: String,
+}
+
+/// One row of `nodes.csv`: a page index and the page's title.
+#[derive(Serialize)]
+struct NodeRow {
+    id: u32,
+    label: String,
+}
+
+/// One row of `edges.csv`: a link between two exported pages.
+#[derive(Serialize)]
+struct EdgeRow {
+    source: u32,
+    target: u32,
+}
+
+/// Write the link graph between the articles of one language as CSV.
+///
+/// Loads brood data from `datafile` and a JSON array of articles from
+/// `articlesfile`. Every page whose title belongs to an article with the
+/// given `language` becomes a row in `nodes.csv`; every link between two
+/// such pages becomes a row in `edges.csv`. Both files are created in the
+/// current working directory.
+///
+/// # Errors
+///
+/// Returns an error if a file cannot be opened or created, if either input
+/// cannot be parsed, or if writing the CSV output fails.
+pub fn run(datafile: &Path, articlesfile: &Path, language: &str) -> io::Result<()> {
+    let mut databuf = BufReader::new(File::open(datafile)?);
+    let data = AdjacencyList::read(&mut databuf)?;
+
+    let articlesbuf = BufReader::new(File::open(articlesfile)?);
+    let articles: Vec<Article> = simd_json::from_reader(articlesbuf).map_err(|e| {
+        io::Error::new(
+            io::ErrorKind::InvalidData,
+            format!("failed to parse articles file: {}", e),
+        )
+    })?;
+
+    let titles = articles
+        .into_iter()
+        .filter(|a| a.language == language)
+        .map(|a| a.title)
+        .collect::<HashSet<_>>();
+
+    // The last entry of `data.pages` is skipped, presumably because it is a
+    // sentinel rather than a real page (TODO confirm against `AdjacencyList`).
+    let pages = data
+        .pages
+        .split_last()
+        .map(|(_, rest)| rest)
+        .unwrap_or_default();
+
+    // Kept as a `Vec` in ascending order so the CSV output is deterministic.
+    let page_ids = pages
+        .iter()
+        .enumerate()
+        .filter(|(_, p)| titles.contains(&p.data.title))
+        .map(|(i, _)| i as u32)
+        .collect::<Vec<_>>();
+    // Separate set for the membership test, which runs once per link.
+    let selected = page_ids.iter().copied().collect::<HashSet<_>>();
+
+    let mut node_writer = csv::Writer::from_writer(BufWriter::new(File::create("nodes.csv")?));
+    for &id in &page_ids {
+        let label = data.page(id).data.title.clone();
+        node_writer.serialize(NodeRow { id, label })?;
+    }
+    // Flush explicitly: an error during the implicit flush on drop would be lost.
+    node_writer.flush()?;
+
+    let mut edge_writer = csv::Writer::from_writer(BufWriter::new(File::create("edges.csv")?));
+    for &source in &page_ids {
+        let targets = data
+            .link_range(source)
+            .map(|li| data.link(li).to)
+            .filter(|to| selected.contains(to));
+        for target in targets {
+            edge_writer.serialize(EdgeRow { source, target })?;
+        }
+    }
+    edge_writer.flush()?;
+
+    Ok(())
+}
diff --git a/brood/src/main.rs b/brood/src/main.rs
index 25c8f56..6b34767 100644
--- a/brood/src/main.rs
+++ b/brood/src/main.rs
@@ -12,9 +12,7 @@ enum Command {
     /// Read sift data on stdin and output brood data.
     Ingest,
     /// Read and reexport brood data.
-    Reexport {
-        to: PathBuf,
-    },
+    Reexport { to: PathBuf },
     /// Find a path from one article to another.
     Path {
         from: String,
@@ -24,11 +22,11 @@ enum Command {
         flip: bool,
     },
     /// Find the longest shortest path starting at an article.
-    LongestShortestPath {
-        from: String,
-    },
-    // Print all page titles.
+    LongestShortestPath { from: String },
+    /// Print all page titles.
     ListPages,
+    /// Construct wikipedia article graph for Leon.
+    LeonWikiGraph { articles: PathBuf, language: String },
 }
 
 #[derive(Debug, Parser)]
@@ -54,5 +52,8 @@ fn main() -> io::Result<()> {
             commands::longest_shortest_path::run(&args.datafile, &from)
         }
         Command::ListPages => commands::list_pages::run(&args.datafile),
+        Command::LeonWikiGraph { articles, language } => {
+            commands::leon_wiki_graph::run(&args.datafile, &articles, &language)
+        }
     }
 }