Implement leon-wiki-graph command
This commit is contained in:
parent
60ba7721db
commit
8605f8d43f
5 changed files with 165 additions and 9 deletions
63
brood/Cargo.lock
generated
63
brood/Cargo.lock
generated
|
|
@ -42,12 +42,25 @@ version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ciborium",
|
"ciborium",
|
||||||
"clap",
|
"clap",
|
||||||
|
"csv",
|
||||||
"rustc-hash",
|
"rustc-hash",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"simd-json",
|
"simd-json",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bstr"
|
||||||
|
version = "0.2.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
|
||||||
|
dependencies = [
|
||||||
|
"lazy_static",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
|
|
@ -118,6 +131,28 @@ dependencies = [
|
||||||
"os_str_bytes",
|
"os_str_bytes",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv"
|
||||||
|
version = "1.1.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
|
||||||
|
dependencies = [
|
||||||
|
"bstr",
|
||||||
|
"csv-core",
|
||||||
|
"itoa 0.4.8",
|
||||||
|
"ryu",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv-core"
|
||||||
|
version = "0.1.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "float-cmp"
|
name = "float-cmp"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
|
|
@ -178,18 +213,36 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itoa"
|
||||||
|
version = "0.4.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itoa"
|
name = "itoa"
|
||||||
version = "1.0.3"
|
version = "1.0.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
|
checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lazy_static"
|
||||||
|
version = "1.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.134"
|
version = "0.2.134"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
|
checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num-traits"
|
name = "num-traits"
|
||||||
version = "0.2.15"
|
version = "0.2.15"
|
||||||
|
|
@ -253,6 +306,12 @@ dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.1.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustc-hash"
|
name = "rustc-hash"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
|
|
@ -291,7 +350,7 @@ version = "1.0.85"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
|
checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"itoa",
|
"itoa 1.0.3",
|
||||||
"ryu",
|
"ryu",
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
@ -355,7 +414,7 @@ checksum = "c0a635407649b66e125e4d2ffd208153210179f8c7c8b71c030aa2ad3eeb4c8f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"float-cmp",
|
"float-cmp",
|
||||||
"halfbrown",
|
"halfbrown",
|
||||||
"itoa",
|
"itoa 1.0.3",
|
||||||
"ryu",
|
"ryu",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ edition = "2021"
|
||||||
[dependencies]
|
[dependencies]
|
||||||
ciborium = "0.2.0"
|
ciborium = "0.2.0"
|
||||||
clap = { version = "4.0.5", features = ["derive"] }
|
clap = { version = "4.0.5", features = ["derive"] }
|
||||||
|
csv = "1.1.6"
|
||||||
rustc-hash = "1.1.0"
|
rustc-hash = "1.1.0"
|
||||||
serde = { version = "1.0.145", features = ["derive"] }
|
serde = { version = "1.0.145", features = ["derive"] }
|
||||||
serde_json = "1.0.85"
|
serde_json = "1.0.85"
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
pub mod ingest;
|
pub mod ingest;
|
||||||
|
pub mod leon_wiki_graph;
|
||||||
pub mod list_pages;
|
pub mod list_pages;
|
||||||
pub mod longest_shortest_path;
|
pub mod longest_shortest_path;
|
||||||
pub mod path;
|
pub mod path;
|
||||||
|
|
|
||||||
94
brood/src/commands/leon_wiki_graph.rs
Normal file
94
brood/src/commands/leon_wiki_graph.rs
Normal file
|
|
@ -0,0 +1,94 @@
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{self, BufReader, BufWriter};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::data::AdjacencyList;
|
||||||
|
|
||||||
|
/// One entry of the articles file, deserialized by `run` to decide which pages to export.
#[derive(Deserialize)]
|
||||||
|
struct Article {
|
||||||
|
/// Article title; `run` matches it against the page titles of the data file.
title: String,
|
||||||
|
/// Article URL. Must be present for deserialization, but is not read anywhere in this file.
url: String,
|
||||||
|
/// Language tag; `run` keeps only articles whose tag equals the requested language.
language: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// One record written to `nodes.csv`: a selected page.
// NOTE(review): field names/order presumably become the CSV header via csv's serde support —
// confirm before renaming or reordering fields.
#[derive(Serialize)]
|
||||||
|
struct NodeRow {
|
||||||
|
/// Index of the page in the data file's page list.
id: u32,
|
||||||
|
/// Title of the page.
label: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// One record written to `edges.csv`: a link between two selected pages.
// NOTE(review): field names/order presumably become the CSV header via csv's serde support —
// confirm before renaming or reordering fields.
#[derive(Serialize)]
|
||||||
|
struct EdgeRow {
|
||||||
|
/// Page index of the page the link starts at.
source: u32,
|
||||||
|
/// Page index of the page the link points to.
target: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(datafile: &Path, articlesfile: &Path, language: &str) -> io::Result<()> {
|
||||||
|
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||||
|
let data = AdjacencyList::read(&mut databuf)?;
|
||||||
|
|
||||||
|
let articlesbuf = BufReader::new(File::open(articlesfile)?);
|
||||||
|
let articles: Vec<Article> =
|
||||||
|
simd_json::from_reader(articlesbuf).expect("failed to parse articles file");
|
||||||
|
|
||||||
|
let titles = articles
|
||||||
|
.into_iter()
|
||||||
|
.filter(|a| a.language == language)
|
||||||
|
.map(|a| a.title)
|
||||||
|
.collect::<HashSet<_>>();
|
||||||
|
|
||||||
|
let page_ids = data
|
||||||
|
.pages
|
||||||
|
.split_last()
|
||||||
|
.unwrap()
|
||||||
|
.1
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(_, p)| titles.contains(&p.data.title))
|
||||||
|
.map(|(i, _)| i as u32)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let mut node_rows = vec![];
|
||||||
|
for i in &page_ids {
|
||||||
|
let page = data.page(*i);
|
||||||
|
let row = NodeRow {
|
||||||
|
id: *i,
|
||||||
|
label: page.data.title.clone(),
|
||||||
|
};
|
||||||
|
node_rows.push(row);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut edge_rows = vec![];
|
||||||
|
for i in &page_ids {
|
||||||
|
let links = data
|
||||||
|
.link_range(*i)
|
||||||
|
.map(|li| data.link(li).to)
|
||||||
|
.filter(|to| page_ids.contains(to))
|
||||||
|
.collect::<HashSet<_>>();
|
||||||
|
|
||||||
|
for to in links {
|
||||||
|
let row = EdgeRow {
|
||||||
|
source: *i,
|
||||||
|
target: to,
|
||||||
|
};
|
||||||
|
edge_rows.push(row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let node_writer = BufWriter::new(File::create("nodes.csv")?);
|
||||||
|
let mut node_writer = csv::Writer::from_writer(node_writer);
|
||||||
|
for node in node_rows {
|
||||||
|
node_writer.serialize(node).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
let edge_writer = BufWriter::new(File::create("edges.csv")?);
|
||||||
|
let mut edge_writer = csv::Writer::from_writer(edge_writer);
|
||||||
|
for edge in edge_rows {
|
||||||
|
edge_writer.serialize(edge).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
@ -12,9 +12,7 @@ enum Command {
|
||||||
/// Read sift data on stdin and output brood data.
|
/// Read sift data on stdin and output brood data.
|
||||||
Ingest,
|
Ingest,
|
||||||
/// Read and reexport brood data.
|
/// Read and reexport brood data.
|
||||||
Reexport {
|
Reexport { to: PathBuf },
|
||||||
to: PathBuf,
|
|
||||||
},
|
|
||||||
/// Find a path from one article to another.
|
/// Find a path from one article to another.
|
||||||
Path {
|
Path {
|
||||||
from: String,
|
from: String,
|
||||||
|
|
@ -24,11 +22,11 @@ enum Command {
|
||||||
flip: bool,
|
flip: bool,
|
||||||
},
|
},
|
||||||
/// Find the longest shortest path starting at an article.
|
/// Find the longest shortest path starting at an article.
|
||||||
LongestShortestPath {
|
LongestShortestPath { from: String },
|
||||||
from: String,
|
/// Print all page titles.
|
||||||
},
|
|
||||||
// Print all page titles.
|
|
||||||
ListPages,
|
ListPages,
|
||||||
|
/// Construct wikipedia article graph for Leon.
|
||||||
|
LeonWikiGraph { articles: PathBuf, language:String },
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Parser)]
|
#[derive(Debug, Parser)]
|
||||||
|
|
@ -54,5 +52,8 @@ fn main() -> io::Result<()> {
|
||||||
commands::longest_shortest_path::run(&args.datafile, &from)
|
commands::longest_shortest_path::run(&args.datafile, &from)
|
||||||
}
|
}
|
||||||
Command::ListPages => commands::list_pages::run(&args.datafile),
|
Command::ListPages => commands::list_pages::run(&args.datafile),
|
||||||
|
Command::LeonWikiGraph { articles ,language} => {
|
||||||
|
commands::leon_wiki_graph::run(&args.datafile, &articles,&language)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue