Implement leon-wiki-graph command

This commit is contained in:
Joscha 2022-11-01 01:22:15 +01:00
parent 60ba7721db
commit 8605f8d43f
5 changed files with 165 additions and 9 deletions

63
brood/Cargo.lock generated
View file

@ -42,12 +42,25 @@ version = "0.1.0"
dependencies = [ dependencies = [
"ciborium", "ciborium",
"clap", "clap",
"csv",
"rustc-hash", "rustc-hash",
"serde", "serde",
"serde_json", "serde_json",
"simd-json", "simd-json",
] ]
[[package]]
name = "bstr"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
@ -118,6 +131,28 @@ dependencies = [
"os_str_bytes", "os_str_bytes",
] ]
[[package]]
name = "csv"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
dependencies = [
"bstr",
"csv-core",
"itoa 0.4.8",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "float-cmp" name = "float-cmp"
version = "0.9.0" version = "0.9.0"
@ -178,18 +213,36 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "itoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]] [[package]]
name = "itoa" name = "itoa"
version = "1.0.3" version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.134" version = "0.2.134"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]] [[package]]
name = "num-traits" name = "num-traits"
version = "0.2.15" version = "0.2.15"
@ -253,6 +306,12 @@ dependencies = [
"proc-macro2", "proc-macro2",
] ]
[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
[[package]] [[package]]
name = "rustc-hash" name = "rustc-hash"
version = "1.1.0" version = "1.1.0"
@ -291,7 +350,7 @@ version = "1.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
dependencies = [ dependencies = [
"itoa", "itoa 1.0.3",
"ryu", "ryu",
"serde", "serde",
] ]
@ -355,7 +414,7 @@ checksum = "c0a635407649b66e125e4d2ffd208153210179f8c7c8b71c030aa2ad3eeb4c8f"
dependencies = [ dependencies = [
"float-cmp", "float-cmp",
"halfbrown", "halfbrown",
"itoa", "itoa 1.0.3",
"ryu", "ryu",
] ]

View file

@ -6,6 +6,7 @@ edition = "2021"
[dependencies] [dependencies]
ciborium = "0.2.0" ciborium = "0.2.0"
clap = { version = "4.0.5", features = ["derive"] } clap = { version = "4.0.5", features = ["derive"] }
csv = "1.1.6"
rustc-hash = "1.1.0" rustc-hash = "1.1.0"
serde = { version = "1.0.145", features = ["derive"] } serde = { version = "1.0.145", features = ["derive"] }
serde_json = "1.0.85" serde_json = "1.0.85"

View file

@ -1,4 +1,5 @@
pub mod ingest; pub mod ingest;
pub mod leon_wiki_graph;
pub mod list_pages; pub mod list_pages;
pub mod longest_shortest_path; pub mod longest_shortest_path;
pub mod path; pub mod path;

View file

@ -0,0 +1,94 @@
use std::collections::HashSet;
use std::fs::File;
use std::io::{self, BufReader, BufWriter};
use std::path::Path;
use serde::{Deserialize, Serialize};
use crate::data::AdjacencyList;
/// One entry of the articles JSON file that `run` parses into a `Vec<Article>`.
#[derive(Deserialize)]
struct Article {
// Article title; `run` matches it against the page titles of the data file.
title: String,
// Not read by `run`. NOTE(review): with serde's default settings the field
// must still be present in the input for deserialization to succeed.
url: String,
// Compared verbatim with the `language` argument of `run` to select articles.
language: String,
}
/// One record of `nodes.csv`, serialized by `run` through a `csv::Writer`.
///
/// The field names become the CSV header, in declaration order.
#[derive(Serialize)]
struct NodeRow {
// Index of the page in the adjacency list's page table.
id: u32,
// Title of that page.
label: String,
}
/// One record of `edges.csv`, serialized by `run` through a `csv::Writer`.
///
/// The field names become the CSV header, in declaration order.
#[derive(Serialize)]
struct EdgeRow {
// Page index of the page containing the link.
source: u32,
// Page index of the page the link points to.
target: u32,
}
/// Export the link graph between a selected set of articles as two CSV files.
///
/// Reads the adjacency list from `datafile` and a JSON array of articles from
/// `articlesfile`. Every page whose title matches an article with the given
/// `language` becomes a node; every link between two such pages becomes an
/// edge. The result is written to `nodes.csv` (`id,label`) and `edges.csv`
/// (`source,target`) in the current working directory.
///
/// # Errors
///
/// Returns an error if either input file cannot be opened or parsed, if the
/// data file contains no pages, or if writing either CSV file fails.
pub fn run(datafile: &Path, articlesfile: &Path, language: &str) -> io::Result<()> {
    let mut databuf = BufReader::new(File::open(datafile)?);
    let data = AdjacencyList::read(&mut databuf)?;

    let articlesbuf = BufReader::new(File::open(articlesfile)?);
    let articles: Vec<Article> = simd_json::from_reader(articlesbuf).map_err(|e| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            format!("failed to parse articles file: {}", e),
        )
    })?;

    let titles = articles
        .into_iter()
        .filter(|a| a.language == language)
        .map(|a| a.title)
        .collect::<HashSet<_>>();

    // The final entry of `pages` is deliberately left out of the search.
    // NOTE(review): presumably it is a sentinel entry — confirm against
    // `AdjacencyList`.
    let (_, pages) = data.pages.split_last().ok_or_else(|| {
        io::Error::new(io::ErrorKind::InvalidData, "data file contains no pages")
    })?;

    // Page ids are positions in `pages`; the adjacency list addresses pages
    // with `u32`, hence the cast.
    let page_ids = pages
        .iter()
        .enumerate()
        .filter(|(_, p)| titles.contains(&p.data.title))
        .map(|(i, _)| i as u32)
        .collect::<Vec<_>>();

    // Hash set for O(1) membership tests while filtering links; scanning the
    // `Vec` for every link would be quadratic in the number of selected pages.
    let selected = page_ids.iter().copied().collect::<HashSet<_>>();

    let mut node_writer = csv::Writer::from_writer(BufWriter::new(File::create("nodes.csv")?));
    for &id in &page_ids {
        let row = NodeRow {
            id,
            label: data.page(id).data.title.clone(),
        };
        node_writer.serialize(row)?;
    }
    // Flush explicitly: errors raised while flushing on drop are discarded.
    node_writer.flush()?;

    let mut edge_writer = csv::Writer::from_writer(BufWriter::new(File::create("edges.csv")?));
    for &source in &page_ids {
        let mut targets = data
            .link_range(source)
            .map(|li| data.link(li).to)
            .filter(|to| selected.contains(to))
            .collect::<Vec<_>>();
        // Collapse repeated links to the same page and emit them in a stable
        // order so the output is reproducible between runs.
        targets.sort_unstable();
        targets.dedup();

        for target in targets {
            edge_writer.serialize(EdgeRow { source, target })?;
        }
    }
    edge_writer.flush()?;

    Ok(())
}

View file

@ -12,9 +12,7 @@ enum Command {
/// Read sift data on stdin and output brood data. /// Read sift data on stdin and output brood data.
Ingest, Ingest,
/// Read and reexport brood data. /// Read and reexport brood data.
Reexport { Reexport { to: PathBuf },
to: PathBuf,
},
/// Find a path from one article to another. /// Find a path from one article to another.
Path { Path {
from: String, from: String,
@ -24,11 +22,11 @@ enum Command {
flip: bool, flip: bool,
}, },
/// Find the longest shortest path starting at an article. /// Find the longest shortest path starting at an article.
LongestShortestPath { LongestShortestPath { from: String },
from: String, /// Print all page titles.
},
// Print all page titles.
ListPages, ListPages,
/// Construct wikipedia article graph for Leon.
LeonWikiGraph { articles: PathBuf, language:String },
} }
#[derive(Debug, Parser)] #[derive(Debug, Parser)]
@ -54,5 +52,8 @@ fn main() -> io::Result<()> {
commands::longest_shortest_path::run(&args.datafile, &from) commands::longest_shortest_path::run(&args.datafile, &from)
} }
Command::ListPages => commands::list_pages::run(&args.datafile), Command::ListPages => commands::list_pages::run(&args.datafile),
Command::LeonWikiGraph { articles ,language} => {
commands::leon_wiki_graph::run(&args.datafile, &articles,&language)
}
} }
} }